Added an extractor for Crunchyroll 'playlists' (i.e. series), so that one can, e.g., download all episodes of a series.

This commit is contained in:
Gabriel Schubiner 2014-10-19 22:47:05 -07:00
parent cc98a3f096
commit 8230018c20
2 changed files with 39 additions and 1 deletions

View file

@ -60,7 +60,10 @@ from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .condenast import CondeNastIE from .condenast import CondeNastIE
from .cracked import CrackedIE from .cracked import CrackedIE
from .criterion import CriterionIE from .criterion import CriterionIE
from .crunchyroll import CrunchyrollIE from .crunchyroll import (
CrunchyrollIE,
CrunchyrollShowPlaylistIE
)
from .cspan import CSpanIE from .cspan import CSpanIE
from .d8 import D8IE from .d8 import D8IE
from .dailymotion import ( from .dailymotion import (

View file

@ -24,6 +24,7 @@ from ..aes import (
aes_cbc_decrypt, aes_cbc_decrypt,
inc, inc,
) )
from .common import InfoExtractor
class CrunchyrollIE(SubtitlesInfoExtractor): class CrunchyrollIE(SubtitlesInfoExtractor):
@ -285,3 +286,37 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'subtitles': subtitles, 'subtitles': subtitles,
'formats': formats, 'formats': formats,
} }
class CrunchyrollShowPlaylistIE(InfoExtractor):
    """Extractor that turns a Crunchyroll show page into a playlist.

    The show page is downloaded once; the show title and the links to the
    individual episode pages are scraped from it with regular expressions,
    and every episode link is wrapped with ``self.url_result`` so that it
    can be resolved separately.
    """
    IE_NAME = "crunchyroll:playlist"
    # A show URL has exactly one path segment.  The negative lookahead rejects
    # the listed site sections; it is anchored with `/?$` so that only an
    # exact section name is rejected and a show slug that merely *starts*
    # with one of these words (e.g. "library-...") is still accepted.
    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login)/?$)(?P<show>[\w\-]+))/?$'
    # Capture everything up to the next tag instead of only word/space
    # characters, so titles containing punctuation or HTML entities are not
    # cut off at the first such character.
    _TITLE_EXTR = r'<span\s+itemprop="name">\s*(?P<showtitle>[^<]+)'
    _TESTS = [{
        'url': 'http://www.crunchyroll.com/attack-on-titan',
        'info_dict': {
            'title': 'Attack on Titan',
        },
        'playlist_count': 15,
    }]

    def _extract_title_entries(self, id, webpage):
        """Scrape the show title and the episode entries from a show page.

        ``id`` is the show slug taken from the URL (the parameter name is
        kept as-is for compatibility, although it shadows the builtin) and
        ``webpage`` is the HTML of the show page.

        Returns a ``(title, entries)`` tuple where ``entries`` is a list with
        one ``self.url_result(...)`` value per episode link found, in the
        order the links appear in the page.
        """
        # The slug is escaped before being embedded in the pattern.  The
        # backreference (?P=vidid) requires the episode link to end with the
        # same numeric id as the surrounding "showview_videos_media_<id>"
        # element, so the lazy `.*?` cannot pair an element with another
        # episode's link.
        episode_re = (
            r'id="showview_videos_media_(?P<vidid>\d+)".*?'
            r'href="/{0}/(?P<vidurl>[\w\-]+-(?P=vidid))"'
        ).format(re.escape(id))

        title = self._html_search_regex(
            self._TITLE_EXTR, webpage, "title",
            flags=re.UNICODE | re.MULTILINE)

        # re.findall yields (vidid, vidurl) tuples; only the slug is needed.
        entries = [
            self.url_result(
                'http://www.crunchyroll.com/{0}/{1}'.format(id, episode_slug))
            for _video_id, episode_slug in re.findall(
                episode_re, webpage, re.UNICODE | re.MULTILINE | re.DOTALL)
        ]
        return title, entries

    def _real_extract(self, url):
        """Download the show page at ``url`` and return a playlist dict.

        The result has ``_type`` set to ``'playlist'``, the show slug as
        ``id``, the scraped show title as ``title`` and the episode entries
        as ``entries``.
        """
        show_id = re.match(self._VALID_URL, url).group('show')
        webpage = self._download_webpage(url, show_id)
        title, entries = self._extract_title_entries(show_id, webpage)
        return {
            '_type': 'playlist',
            'id': show_id,
            'title': title,
            'entries': entries,
        }