[utils] Use a regexp instead of HTMLParser for get_element_by_attribute

This commit is contained in:
Philipp Hagemeister 2014-11-04 23:33:43 +01:00
parent 11fba1751d
commit 3828505646
1 changed file with 15 additions and 104 deletions

View File

@ -152,86 +152,6 @@ def xpath_text(node, xpath, name=None, fatal=False):
return n.text return n.text
# On interpreters older than Python 2.7, overwrite the module-level
# `locatestarttagend` pattern of the compat HTML parser module with the
# regex below (labelled a backported bugfix by the original author).
# NOTE(review): presumably this is the pattern HTMLParser uses to find the
# end of a start tag -- confirm against the targeted stdlib version.
if sys.version_info < (2, 7):
    compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse.

    After construction ``self.html`` is ``None``; ``loads()`` stores the
    document there so that subclasses can slice the original text using the
    positions reported by ``getpos()``.
    """

    # This used to be spelled ``__init`` (missing trailing underscores), which
    # Python name-mangles into an ordinary method that is never called, so
    # ``self.html`` stayed undefined until ``loads()`` ran.
    def __init__(self):
        # Explicit base call rather than super(): kept as in the original so
        # it works regardless of how the compat base class is declared.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store *html* on the instance, feed it to the parser and close it."""
        self.html = html
        self.feed(html)
        self.close()
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, the first start tag whose `attribute` equals `value` is
    recorded; `self.result` then grows to `[tag, start_pos, end_pos]`, where
    both positions are whatever `getpos()` returned at that moment.
    `get_result()` uses those positions to cut the element's content out of
    the original document text.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # None until a matching start tag is seen, then [tag, start, end].
        self.result = None
        # True while the parser is inside the matched element.
        self.started = False
        # Per-tag-name count of start tags minus end tags seen since the match.
        self.depth = {}
        # True between the matching start tag and the next parser event.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error unless it is no longer safe to do so.

        Raises HTMLParseError once more than 10 errors were seen or when the
        error occurs inside the matched element; otherwise the pending input
        is replaced by the document text starting after the current line
        and parsing is resumed.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # A nested start tag is also an event that can mark where the
        # matched element's content begins.
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        # Count every tag opened from the match onwards (including the
        # matching tag itself) so the corresponding end tag can be found.
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once the counter for the matched
            # tag's own name is back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every content-type callback is aliased to find_startpos, so whichever
    # event follows the matching start tag records the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None.

        None is returned when no tag matched or when the start/end positions
        were not both recorded (self.result does not have three entries).
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are used as (line, column); the line number is treated
        # as 1-based here, hence the -1 on the slice start.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        # When start and end are on the same line, that line has already
        # lost its leading columns, so the end column is shifted to match.
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python older than 2.7.3, replace AttrParser.parse_endtag: when the
# remaining input at index i starts with the literal text </scr'+'ipt>,
# return the index just past it instead of parsing it; in every other case
# defer to the stock HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Thin wrapper that looks the element up by its ``id`` attribute via
    get_element_by_attribute() and returns that function's result unchanged.
    The parameter name ``id`` shadows the builtin but is kept because it is
    part of the public signature.
    """
    return get_element_by_attribute("id", id, html)
@ -239,34 +159,25 @@ def get_element_by_id(id, html):
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    The first element whose start tag carries ``attribute`` with exactly
    ``value`` (quoted or unquoted) is located with a regular expression, and
    the markup between that start tag and the nearest closing tag of the same
    name is returned with HTML entities unescaped.

    Returns None when no such element is found.
    """
    # Any number of other attributes: `name`, `name=`, `name=bare`,
    # `name="..."` or `name='...'`.  Empty and valueless attributes are
    # accepted so that tags such as <div class="" itemscope id="x"> match.
    other_attrs = r'''(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?'''

    # re.escape() also escapes whitespace and '#', so the escaped attribute
    # name and value stay literal inside this verbose (?x) pattern.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         %(attrs)s
         \s+%(name)s=['"]?%(value)s['"]?
         %(attrs)s
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % {
        'attrs': other_attrs,
        'name': re.escape(attribute),
        'value': re.escape(value),
    }, html)

    if not m:
        return None
    res = m.group('content')

    # Strip surrounding quotes only when the content really is wrapped in a
    # matching pair; blindly dropping the last character would corrupt
    # content that merely begins with a quote.
    if len(res) >= 2 and res[0] in ('"', "'") and res[-1] == res[0]:
        res = res[1:-1]

    return unescapeHTML(res)
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.

    After parsing, get_result() returns the ``content`` attribute of the
    matching <meta> tag, or None if no tag matched or the tag has no
    ``content`` attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        # Not read anywhere inside this class; kept so the instance keeps
        # the same set of attributes.
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        """Record the content attribute of a <meta> tag whose name matches."""
        if tag != 'meta':
            return
        attrs = dict(attrs)
        # Every matching tag overwrites the stored value, so the last
        # matching <meta> tag in the document wins.
        if attrs.get('name') == self.name:
            self.result = attrs.get('content')

    def get_result(self):
        """Return the recorded content value (None when nothing matched)."""
        return self.result
def clean_html(html): def clean_html(html):