| 1 | # A simple HTML table parser. It turns tables (including nested tables) into arrays |
|---|
| 2 | # This program is free software: you can redistribute it and/or modify |
|---|
| 3 | # it under the terms of the GNU General Public License as published by |
|---|
| 4 | # the Free Software Foundation, either version 3 of the License, or |
|---|
| 5 | # (at your option) any later version. |
|---|
| 6 | # |
|---|
| 7 | # This program is distributed in the hope that it will be useful, |
|---|
| 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 10 | # GNU General Public License for more details. |
|---|
| 11 | # |
|---|
| 12 | # You should have received a copy of the GNU General Public License |
|---|
| 13 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
|---|
| 14 | # |
|---|
| 15 | # Nigel Sim <nigel.sim@gmail.com> |
|---|
| 16 | # http://simbot.wordpress.com |
|---|
| 17 | # http://projects.nigelsim.org |
|---|
| 18 | from HTMLParser import HTMLParser |
|---|
| 19 | import re, string, os |
|---|
| 20 | from string import lower |
|---|
| 21 | |
|---|
class Table(list):
    """List subclass used by the parser to mark a collected HTML table."""
|---|
| 24 | |
|---|
class Row(list):
    """List subclass used by the parser to mark a collected table row."""
|---|
| 27 | |
|---|
class Cell(list):
    """The cell holds components of text and HTML anchors"""

    def append(self, item):
        """Add *item* to the cell, merging runs of adjacent strings.

        A ``str`` that directly follows another ``str`` is concatenated onto
        it, so consecutive text fragments end up as a single element. Every
        other item, including a string that follows a non-string element,
        is appended as a new element.
        """
        if isinstance(item, str) and self and isinstance(self[-1], str):
            # Extend the trailing text fragment instead of adding a new one.
            self[-1] = self[-1] + item
        else:
            # Previously a string following anything other than a str or an
            # Anchor was silently discarded; it is now kept.
            super(Cell, self).append(item)
|---|
| 43 | |
|---|
class Anchor(object):
    """An HTML anchor: a link target plus the text attached to it."""

    def __init__(self, href):
        # The text stays None until append() is called.
        self.text = None
        self.href = href

    def append(self, text):
        """Store *text* as the anchor text, replacing any earlier value."""
        self.text = text

    def __str__(self):
        """Render the anchor as an ``<a>`` element."""
        fields = (self.href, self.text)
        return '<a href="%s">%s</a>' % fields

    def __repr__(self):
        """Possibly not the best repr"""
        template = '{"href": "%s", "text": "%s"}'
        return template % (self.href, self.text)
|---|
| 56 | |
|---|
class Image(object):
    """An HTML image, described by the value of its ``src`` attribute."""

    def __init__(self, src):
        self.src = src

    def __repr__(self):
        """Return a JSON-like description of the image source."""
        template = '{"src": "%s"}'
        return template % self.src
|---|
| 62 | |
|---|
def top(x):
    """Return the item on the top of a stack, i.e. the last element of *x*.

    Raises IndexError when *x* is empty.
    """
    return x[-1]
|---|
| 66 | |
|---|
class TableParser(HTMLParser):
    """HTML parser that collects tables (including nested ones) into lists.

    While parsing, open elements are kept on ``self._stack`` whose bottom
    entry is ``self.doc``. Closing a table, row or cell pops it and appends
    it to whatever is then on top of the stack, so finished top-level
    tables end up in ``self.doc``.
    """

    def __init__(self, parser=None, anchors=True):
        """
        The parser is a method which will be passed the doc at the end
        of the parsing. Useful if TableParser is within an inner loop and
        you want to automatically process the document. If it is omitted then
        it will do nothing.

        When anchors is true, <a> and <img> elements inside a cell are
        stored as Anchor / Image objects; otherwise those tags are ignored
        and only their text is kept.
        """
        self._tag = None
        self._buf = None
        self._attrs = None
        self.doc = None  # Where the document will be stored
        self._stack = None
        self._parser = parser
        self.anchors = anchors
        # The base initialiser calls self.reset(), which sets up doc, _stack
        # and _buf; it also creates state that newer HTMLParser versions need.
        HTMLParser.__init__(self)

    def reset(self):
        """Discard all parsed data and start with an empty document."""
        HTMLParser.reset(self)
        self.doc = []
        self._stack = [self.doc]
        self._buf = ''

    def close(self):
        """Finish parsing and pass the document to the callback, if any."""
        HTMLParser.close(self)
        if self._parser is not None:
            self._parser(self.doc)

    def _flush_text(self):
        """Move buffered text into the Cell on top of the stack and clear it."""
        current = self._stack[-1]
        if self._buf and isinstance(current, Cell):
            current.append(self._buf)
        self._buf = ''

    def _pop_to(self, kind):
        """Pop the stack down to and including the nearest *kind* instance.

        Returns the popped instance, or None (leaving the stack untouched)
        when no such element is open, so a stray end tag cannot empty the
        stack.
        """
        if not any(isinstance(entry, kind) for entry in self._stack):
            return None
        node = None
        while not isinstance(node, kind):
            node = self._stack.pop()
        return node

    def handle_starttag(self, tag, attrs):
        """Open a table, row, cell or anchor, or record an image."""
        self._tag = tag
        self._attrs = attrs
        attrs = dict(attrs)
        tag = tag.lower()
        if tag == 'table':
            # Keep text that precedes a nested table inside its cell.
            self._flush_text()
            self._stack.append(Table())
        elif tag == 'tr':
            self._buf = ''
            self._stack.append(Row())
        elif tag == 'td':
            self._buf = ''
            self._stack.append(Cell())
        elif tag == 'a' and self.anchors and isinstance(self._stack[-1], Cell):
            # add the text we already have
            self._flush_text()
            # An anchor without href (e.g. <a name="...">) gets href None.
            self._stack.append(Anchor(attrs.get('href')))
        elif tag == 'img' and self.anchors and isinstance(self._stack[-1], Cell):
            # add the text we already have
            self._flush_text()
            self._stack[-1].append(Image(attrs.get('src')))

    def handle_endtag(self, tag):
        """Close the matching open element and attach it to its parent.

        End tags without a matching open element are ignored.
        """
        tag = tag.lower()
        if tag == 'table':
            table = self._pop_to(Table)
            if table is not None:
                self._stack[-1].append(table)
        elif tag == 'tr':
            row = self._pop_to(Row)
            if row is not None:
                self._stack[-1].append(row)
        elif tag == 'td':
            cell = self._pop_to(Cell)
            if cell is not None:
                parent = self._stack[-1]
                if isinstance(parent, Row):
                    if self._buf:
                        cell.append(self._buf)
                    parent.append(cell)
                else:
                    print("Cell not in a row, rather in a %s" % parent)
                # The text belonged to this cell; do not let it leak into
                # an enclosing cell.
                self._buf = ''
        elif tag == 'a' and self.anchors and isinstance(self._stack[-1], Anchor):
            anchor = self._stack.pop()
            parent = self._stack[-1]
            if isinstance(parent, Cell):
                anchor.append(self._buf)
                self._buf = ''
                parent.append(anchor)
            else:
                print("anchor should be in a cell")
        self._tag = None

    def handle_data(self, data):
        """Accumulate character data until a tag consumes or clears it."""
        self._buf += data
|---|
| 166 | |
|---|
| 167 | |
|---|
if __name__ == '__main__':
    # Command-line entry point: "-t" runs the doctests, any other single
    # argument is treated as a URL whose tables are parsed and printed.
    import sys
    if len(sys.argv) == 2 and sys.argv[1] == '-t':
        # Run the doctests
        import doctest
        doctest.testmod()
    elif len(sys.argv) == 2:
        # Parse the URL provided
        import pprint
        import urllib2
        f = urllib2.urlopen(sys.argv[1])
        tp = TableParser()
        try:
            tp.feed(f.read())
        finally:
            # Release the connection even when reading or parsing fails.
            f.close()
        pprint.pprint(tp.doc)
    else:
        # Print usage
        print("run doctests: <script> -t")
        print("parse URL: <script> <URL>")
|---|