Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

This commit is contained in:
Sergey M․ 2018-03-18 02:46:50 +07:00
parent e0d198c18d
commit 47a5cb7734
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
3 changed files with 52 additions and 36 deletions

View File

@@ -698,40 +698,47 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
_TEST_CASES = [ _TEST_CASES = [
( (
'foo_xspf', 'foo_xspf',
'https://example.org/src/', 'https://example.org/src/foo_xspf.xspf',
[{ [{
'id': 'foo_xspf',
'title': 'Pandemonium',
'description': 'Visit http://bigbrother404.bandcamp.com', 'description': 'Visit http://bigbrother404.bandcamp.com',
'duration': 202.416, 'duration': 202.416,
'formats': [{'url': 'https://example.org/src/cd1/track%201.mp3'}], 'formats': [{
'manifest_url': 'https://example.org/src/foo_xspf.xspf',
'url': 'https://example.org/src/cd1/track%201.mp3',
}],
}, {
'id': 'foo_xspf', 'id': 'foo_xspf',
'title': 'Pandemonium' 'title': 'Final Cartridge (Nichico Twelve Remix)',
},
{
'description': 'Visit http://bigbrother404.bandcamp.com', 'description': 'Visit http://bigbrother404.bandcamp.com',
'duration': 255.857, 'duration': 255.857,
'formats': [{'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3'}], 'formats': [{
'manifest_url': 'https://example.org/src/foo_xspf.xspf',
'url': 'https://example.org/%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF%E3%80%80%EF%BC%92.mp3',
}],
}, {
'id': 'foo_xspf', 'id': 'foo_xspf',
'title': 'Final Cartridge (Nichico Twelve Remix)' 'title': 'Rebuilding Nightingale',
},
{
'description': 'Visit http://bigbrother404.bandcamp.com', 'description': 'Visit http://bigbrother404.bandcamp.com',
'duration': 287.915, 'duration': 287.915,
'formats': [ 'formats': [{
{'url': 'https://example.org/src/track3.mp3'}, 'manifest_url': 'https://example.org/src/foo_xspf.xspf',
{'url': 'https://example.com/track3.mp3'} 'url': 'https://example.org/src/track3.mp3',
], }, {
'id': 'foo_xspf', 'manifest_url': 'https://example.org/src/foo_xspf.xspf',
'title': 'Rebuilding Nightingale' 'url': 'https://example.com/track3.mp3',
}]
}] }]
), ),
] ]
for xspf_file, xspf_base_url, expected_entries in _TEST_CASES: for xspf_file, xspf_url, expected_entries in _TEST_CASES:
with io.open('./test/testdata/xspf/%s.xspf' % xspf_file, with io.open('./test/testdata/xspf/%s.xspf' % xspf_file,
mode='r', encoding='utf-8') as f: mode='r', encoding='utf-8') as f:
entries = self.ie._parse_xspf( entries = self.ie._parse_xspf(
compat_etree_fromstring(f.read().encode('utf-8')), compat_etree_fromstring(f.read().encode('utf-8')),
xspf_file, xspf_base_url) xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url)
expect_value(self, entries, expected_entries, None) expect_value(self, entries, expected_entries, None)
for i in range(len(entries)): for i in range(len(entries)):
expect_dict(self, entries[i], expected_entries[i]) expect_dict(self, entries[i], expected_entries[i])

View File

@@ -1706,22 +1706,24 @@ class InfoExtractor(object):
}) })
return subtitles return subtitles
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
xspf = self._download_xml( xspf = self._download_xml(
playlist_url, playlist_id, 'Downloading xpsf playlist', xspf_url, playlist_id, 'Downloading xpsf playlist',
'Unable to download xspf manifest', fatal=fatal) 'Unable to download xspf manifest', fatal=fatal)
if xspf is False: if xspf is False:
return [] return []
return self._parse_xspf(xspf, playlist_id, base_url(playlist_url)) return self._parse_xspf(
xspf, playlist_id, xspf_url=xspf_url,
xspf_base_url=base_url(xspf_url))
def _parse_xspf(self, playlist, playlist_id, playlist_base_url=''): def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
NS_MAP = { NS_MAP = {
'xspf': 'http://xspf.org/ns/0/', 'xspf': 'http://xspf.org/ns/0/',
's1': 'http://static.streamone.nl/player/ns/0', 's1': 'http://static.streamone.nl/player/ns/0',
} }
entries = [] entries = []
for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
title = xpath_text( title = xpath_text(
track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
description = xpath_text( description = xpath_text(
@@ -1731,12 +1733,18 @@ class InfoExtractor(object):
duration = float_or_none( duration = float_or_none(
xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
formats = [{ formats = []
'url': urljoin(playlist_base_url, location.text), for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), format_url = urljoin(xspf_base_url, location.text)
'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), if not format_url:
'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), continue
} for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] formats.append({
'url': format_url,
'manifest_url': xspf_url,
'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
})
self._sort_formats(formats) self._sort_formats(formats)
entries.append({ entries.append({
@@ -1750,18 +1758,18 @@ class InfoExtractor(object):
return entries return entries
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
res = self._download_webpage_handle( res = self._download_xml_handle(
mpd_url, video_id, mpd_url, video_id,
note=note or 'Downloading MPD manifest', note=note or 'Downloading MPD manifest',
errnote=errnote or 'Failed to download MPD manifest', errnote=errnote or 'Failed to download MPD manifest',
fatal=fatal) fatal=fatal)
if res is False: if res is False:
return [] return []
mpd, urlh = res mpd_doc, urlh = res
mpd_base_url = base_url(urlh.geturl()) mpd_base_url = base_url(urlh.geturl())
return self._parse_mpd_formats( return self._parse_mpd_formats(
compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
formats_dict=formats_dict, mpd_url=mpd_url) formats_dict=formats_dict, mpd_url=mpd_url)
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
@@ -2035,17 +2043,16 @@ class InfoExtractor(object):
return formats return formats
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
res = self._download_webpage_handle( res = self._download_xml_handle(
ism_url, video_id, ism_url, video_id,
note=note or 'Downloading ISM manifest', note=note or 'Downloading ISM manifest',
errnote=errnote or 'Failed to download ISM manifest', errnote=errnote or 'Failed to download ISM manifest',
fatal=fatal) fatal=fatal)
if res is False: if res is False:
return [] return []
ism, urlh = res ism_doc, urlh = res
return self._parse_ism_formats( return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
""" """

View File

@@ -2233,7 +2233,9 @@ class GenericIE(InfoExtractor):
return smil return smil
elif doc.tag == '{http://xspf.org/ns/0/}playlist': elif doc.tag == '{http://xspf.org/ns/0/}playlist':
return self.playlist_result( return self.playlist_result(
self._parse_xspf(doc, video_id, compat_str(full_response.geturl())), self._parse_xspf(
doc, video_id, xspf_url=url,
xspf_base_url=compat_str(full_response.geturl())),
video_id) video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'] = self._parse_mpd_formats( info_dict['formats'] = self._parse_mpd_formats(