[youtube:feed] Implement lazy playlist extraction (closes #10184)

This commit is contained in:
Sergey M․ 2018-04-22 06:07:32 +07:00
parent 6cdaaf7031
commit 3853309fe2
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@@ -2699,10 +2699,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _real_extract(self, url): def _entries(self, page):
page = self._download_webpage(
'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
# The extraction process is the same as for playlists, but the regex # The extraction process is the same as for playlists, but the regex
# for the video ids doesn't contain an index # for the video ids doesn't contain an index
ids = [] ids = []
@@ -2713,12 +2710,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
# 'recommended' feed has infinite 'load more' and each new portion spins # 'recommended' feed has infinite 'load more' and each new portion spins
# the same videos in (sometimes) slightly different order, so we'll check # the same videos in (sometimes) slightly different order, so we'll check
# for unicity and break when portion has no new videos # for unicity and break when portion has no new videos
new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
if not new_ids: if not new_ids:
break break
ids.extend(new_ids) ids.extend(new_ids)
for entry in self._ids_to_results(new_ids):
yield entry
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
if not mobj: if not mobj:
break break
@@ -2730,8 +2730,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
content_html = more['content_html'] content_html = more['content_html']
more_widget_html = more['load_more_widget_html'] more_widget_html = more['load_more_widget_html']
def _real_extract(self, url):
page = self._download_webpage(
'https://www.youtube.com/feed/%s' % self._FEED_NAME,
self._PLAYLIST_TITLE)
return self.playlist_result( return self.playlist_result(
self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) self._entries(page), playlist_title=self._PLAYLIST_TITLE)
class YoutubeWatchLaterIE(YoutubePlaylistIE): class YoutubeWatchLaterIE(YoutubePlaylistIE):