Changeset 157 for table_parser
- Timestamp: 21/11/10 02:15:17 (18 months ago)
- File: 1 edited
table_parser/table_parser.py (modified) (6 diffs)
Legend:
- Unmodified
- Added
- Removed
-
table_parser/table_parser.py
r156 r157 26 26 pass 27 27 28 class Cell(object): 29 def __init__(self): 30 self.data = None 31 return 28 class Cell(list): 29 """The cell holds components of text and HTML anchors""" 32 30 def append(self,item): 33 if self.data != None: 34 print "Overwriting %s"%self.data 35 self.data = item 31 s = super(Cell, self) 32 if isinstance(item, str): 33 if len(self) == 0 or isinstance(top(self), str): 34 if len(self) > 0: 35 i = len(self)-1 36 self[i] = self[i] + item 37 else: 38 s.append(item) 39 elif isinstance(top(self), Anchor): 40 s.append(item) 41 else: 42 s.append(item) 43 44 class Anchor(object): 45 """HTML Anchor""" 46 def __init__(self, href): 47 self.href = href 48 self.text = None 49 def append(self, text): 50 self.text = text 51 def __str__(self): 52 return '<a href="%s">%s</a>'%(self.href, self.text) 36 53 37 54 # Get the item on the top of a stack … … 40 57 41 58 class TableParser(HTMLParser): 42 def __init__(self, parser=None ):59 def __init__(self, parser=None, anchors=True): 43 60 """ 44 61 The parser is a method which will be passed the doc at the end … … 54 71 self._parser = parser 55 72 self.reset() 73 self.anchors = anchors 56 74 return 57 75 … … 70 88 self._tag = tag 71 89 self._attrs = attrs 90 attrs = dict(attrs) 72 91 if lower(tag) == 'table': 73 92 self._buf = '' … … 79 98 self._buf = '' 80 99 self._stack.append(Cell()) 100 elif lower(tag) == 'a' and self.anchors and isinstance(top(self._stack), Cell): 101 # add the text we already have 102 top(self._stack).append(self._buf) 103 self._buf = '' 104 self._stack.append(Anchor(attrs['href'])) 81 105 82 106 #print "Encountered the beginning of a %s tag" % tag … … 103 127 t = top(self._stack) 104 128 if isinstance(t, Row): 105 # We can not currently have text and other table elements in the same cell. 
106 # Table elements get precedence 107 if c.data == None: 108 t.append(self._buf) 109 else: 110 t.append(c.data) 129 c.append(self._buf) 130 t.append(c) 111 131 else: 112 132 print "Cell not in a row, rather in a %s"%t 133 elif lower(tag) == 'a' and self.anchors and isinstance(top(self._stack), Anchor): 134 a = None 135 while not isinstance(a, Anchor): 136 a = self._stack.pop() 137 c = top(self._stack) 138 if isinstance(c, Cell): 139 a.append(self._buf) 140 self._buf = '' 141 c.append(a) 142 else: 143 print "anchor should be in a cell" 113 144 self._tag = None 114 145 #print "Encountered the end of a %s tag" % tag
Note: See TracChangeset for help on using the changeset viewer.
