From 2b14cb566fde3e5482ce9a63b2be7103cec939e0 Mon Sep 17 00:00:00 2001
From: remitamine
Date: Thu, 28 Jan 2016 12:38:34 +0100
Subject: [PATCH] [utils] fix dfxp2srt text extraction(fixes #8055)

---
 youtube_dl/utils.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index c63b61598..18dbe28bb 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
         'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
     })
 
+    class TTMLPElementParser:
+        out = ''
+
+        def start(self, tag, attrib):
+            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+                self.out += '\n'
+
+        def end(self, tag):
+            pass
+
+        def data(self, data):
+            self.out += data
+
+        def close(self):
+            return self.out.strip()
+
     def parse_node(node):
-        str_or_empty = functools.partial(str_or_none, default='')
-
-        out = str_or_empty(node.text)
-
-        for child in node:
-            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
-                out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
-                out += str_or_empty(parse_node(child))
-            else:
-                out += str_or_empty(xml.etree.ElementTree.tostring(child))
-
-        return out
+        target = TTMLPElementParser()
+        parser = xml.etree.ElementTree.XMLParser(target=target)
+        parser.feed(xml.etree.ElementTree.tostring(node))
+        return parser.close()
 
     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
     out = []