[crunchyroll] parse vilos media data(closes #17343)

This commit is contained in:
Remita Amine 2018-09-01 08:16:28 +01:00
parent ed6919e737
commit 54a5be4dba
2 changed files with 141 additions and 112 deletions

View file

@ -7,7 +7,7 @@ import zlib
from hashlib import sha1 from hashlib import sha1
from math import pow, sqrt, floor from math import pow, sqrt, floor
from .common import InfoExtractor from .vrv import VRVIE
from ..compat import ( from ..compat import (
compat_b64decode, compat_b64decode,
compat_etree_fromstring, compat_etree_fromstring,
@ -18,6 +18,8 @@ from ..compat import (
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
bytes_to_intlist, bytes_to_intlist,
extract_attributes,
float_or_none,
intlist_to_bytes, intlist_to_bytes,
int_or_none, int_or_none,
lowercase_escape, lowercase_escape,
@ -26,14 +28,13 @@ from ..utils import (
unified_strdate, unified_strdate,
urlencode_postdata, urlencode_postdata,
xpath_text, xpath_text,
extract_attributes,
) )
from ..aes import ( from ..aes import (
aes_cbc_decrypt, aes_cbc_decrypt,
) )
class CrunchyrollBaseIE(InfoExtractor): class CrunchyrollBaseIE(VRVIE):
_LOGIN_URL = 'https://www.crunchyroll.com/login' _LOGIN_URL = 'https://www.crunchyroll.com/login'
_LOGIN_FORM = 'login_form' _LOGIN_FORM = 'login_form'
_NETRC_MACHINE = 'crunchyroll' _NETRC_MACHINE = 'crunchyroll'
@ -148,7 +149,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Wanna be the Strongest in the World Episode 1 An Idol-Wrestler is Born!', 'title': 'Wanna be the Strongest in the World Episode 1 An Idol-Wrestler is Born!',
'description': 'md5:2d17137920c64f2f49981a7797d275ef', 'description': 'md5:2d17137920c64f2f49981a7797d275ef',
'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'uploader': 'Yomiuri Telecasting Corporation (YTV)',
'upload_date': '20131013', 'upload_date': '20131013',
'url': 're:(?!.*&amp)', 'url': 're:(?!.*&amp)',
@ -221,7 +222,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'info_dict': { 'info_dict': {
'id': '535080', 'id': '535080',
'ext': 'mp4', 'ext': 'mp4',
'title': '11eyes Episode 1 Piros éjszaka - Red Night', 'title': '11eyes Episode 1 Red Night ~ Piros éjszaka',
'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
'uploader': 'Marvelous AQL Inc.', 'uploader': 'Marvelous AQL Inc.',
'upload_date': '20091021', 'upload_date': '20091021',
@ -437,13 +438,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
if 'To view this, please log in to verify you are 18 or older.' in webpage: if 'To view this, please log in to verify you are 18 or older.' in webpage:
self.raise_login_required() self.raise_login_required()
media = self._parse_json(self._search_regex(
r'vilos\.config\.media\s*=\s*({.+?});',
webpage, 'vilos media', default='{}'), video_id)
media_metadata = media.get('metadata') or {}
video_title = self._html_search_regex( video_title = self._html_search_regex(
r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
webpage, 'video_title') webpage, 'video_title')
video_title = re.sub(r' {2,}', ' ', video_title) video_title = re.sub(r' {2,}', ' ', video_title)
video_description = self._parse_json(self._html_search_regex( video_description = (self._parse_json(self._html_search_regex(
r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
webpage, 'description', default='{}'), video_id).get('description') webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
if video_description: if video_description:
video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
video_upload_date = self._html_search_regex( video_upload_date = self._html_search_regex(
@ -456,91 +462,99 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
[r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
webpage, 'video_uploader', fatal=False) webpage, 'video_uploader', fatal=False)
available_fmts = []
for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
attrs = extract_attributes(a)
href = attrs.get('href')
if href and '/freetrial' in href:
continue
available_fmts.append(fmt)
if not available_fmts:
for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
available_fmts = re.findall(p, webpage)
if available_fmts:
break
video_encode_ids = []
formats = [] formats = []
for fmt in available_fmts: for stream in media.get('streams', []):
stream_quality, stream_format = self._FORMAT_IDS[fmt] formats.extend(self._extract_vrv_formats(
video_format = fmt + 'p' stream.get('url'), video_id, stream.get('format'),
stream_infos = [] stream.get('audio_lang'), stream.get('hardsub_lang')))
streamdata = self._call_rpc_api( if not formats:
'VideoPlayer_GetStandardConfig', video_id, available_fmts = []
'Downloading media info for %s' % video_format, data={ for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
'media_id': video_id, attrs = extract_attributes(a)
'video_format': stream_format, href = attrs.get('href')
'video_quality': stream_quality, if href and '/freetrial' in href:
'current_page': url, continue
}) available_fmts.append(fmt)
if streamdata is not None: if not available_fmts:
stream_info = streamdata.find('./{default}preload/stream_info') for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
available_fmts = re.findall(p, webpage)
if available_fmts:
break
if not available_fmts:
available_fmts = self._FORMAT_IDS.keys()
video_encode_ids = []
for fmt in available_fmts:
stream_quality, stream_format = self._FORMAT_IDS[fmt]
video_format = fmt + 'p'
stream_infos = []
streamdata = self._call_rpc_api(
'VideoPlayer_GetStandardConfig', video_id,
'Downloading media info for %s' % video_format, data={
'media_id': video_id,
'video_format': stream_format,
'video_quality': stream_quality,
'current_page': url,
})
if streamdata is not None:
stream_info = streamdata.find('./{default}preload/stream_info')
if stream_info is not None:
stream_infos.append(stream_info)
stream_info = self._call_rpc_api(
'VideoEncode_GetStreamInfo', video_id,
'Downloading stream info for %s' % video_format, data={
'media_id': video_id,
'video_format': stream_format,
'video_encode_quality': stream_quality,
})
if stream_info is not None: if stream_info is not None:
stream_infos.append(stream_info) stream_infos.append(stream_info)
stream_info = self._call_rpc_api( for stream_info in stream_infos:
'VideoEncode_GetStreamInfo', video_id, video_encode_id = xpath_text(stream_info, './video_encode_id')
'Downloading stream info for %s' % video_format, data={ if video_encode_id in video_encode_ids:
'media_id': video_id, continue
'video_format': stream_format, video_encode_ids.append(video_encode_id)
'video_encode_quality': stream_quality,
})
if stream_info is not None:
stream_infos.append(stream_info)
for stream_info in stream_infos:
video_encode_id = xpath_text(stream_info, './video_encode_id')
if video_encode_id in video_encode_ids:
continue
video_encode_ids.append(video_encode_id)
video_file = xpath_text(stream_info, './file') video_file = xpath_text(stream_info, './file')
if not video_file: if not video_file:
continue continue
if video_file.startswith('http'): if video_file.startswith('http'):
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
video_file, video_id, 'mp4', entry_protocol='m3u8_native', video_file, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False)) m3u8_id='hls', fatal=False))
continue
video_url = xpath_text(stream_info, './host')
if not video_url:
continue
metadata = stream_info.find('./metadata')
format_info = {
'format': video_format,
'height': int_or_none(xpath_text(metadata, './height')),
'width': int_or_none(xpath_text(metadata, './width')),
}
if '.fplive.net/' in video_url:
video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
parsed_video_url = compat_urlparse.urlparse(video_url)
direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
netloc='v.lvlt.crcdn.net',
path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
if self._is_valid_url(direct_video_url, video_id, video_format):
format_info.update({
'format_id': 'http-' + video_format,
'url': direct_video_url,
})
formats.append(format_info)
continue continue
format_info.update({ video_url = xpath_text(stream_info, './host')
'format_id': 'rtmp-' + video_format, if not video_url:
'url': video_url, continue
'play_path': video_file, metadata = stream_info.find('./metadata')
'ext': 'flv', format_info = {
}) 'format': video_format,
formats.append(format_info) 'height': int_or_none(xpath_text(metadata, './height')),
'width': int_or_none(xpath_text(metadata, './width')),
}
if '.fplive.net/' in video_url:
video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
parsed_video_url = compat_urlparse.urlparse(video_url)
direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
netloc='v.lvlt.crcdn.net',
path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
if self._is_valid_url(direct_video_url, video_id, video_format):
format_info.update({
'format_id': 'http-' + video_format,
'url': direct_video_url,
})
formats.append(format_info)
continue
format_info.update({
'format_id': 'rtmp-' + video_format,
'url': video_url,
'play_path': video_file,
'ext': 'flv',
})
formats.append(format_info)
self._sort_formats(formats, ('height', 'width', 'tbr', 'fps')) self._sort_formats(formats, ('height', 'width', 'tbr', 'fps'))
metadata = self._call_rpc_api( metadata = self._call_rpc_api(
@ -549,7 +563,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'media_id': video_id, 'media_id': video_id,
}) })
subtitles = self.extract_subtitles(video_id, webpage) subtitles = {}
for subtitle in media.get('subtitles', []):
subtitle_url = subtitle.get('url')
if not subtitle_url:
continue
subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
'url': subtitle_url,
'ext': subtitle.get('format', 'ass'),
})
if not subtitles:
subtitles = self.extract_subtitles(video_id, webpage)
# webpage provide more accurate data than series_title from XML # webpage provide more accurate data than series_title from XML
series = self._html_search_regex( series = self._html_search_regex(
@ -557,8 +581,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
webpage, 'series', fatal=False) webpage, 'series', fatal=False)
season = xpath_text(metadata, 'series_title') season = xpath_text(metadata, 'series_title')
episode = xpath_text(metadata, 'episode_title') episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title')
episode_number = int_or_none(xpath_text(metadata, 'episode_number')) episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number'))
season_number = int_or_none(self._search_regex( season_number = int_or_none(self._search_regex(
r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
@ -568,7 +592,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'id': video_id, 'id': video_id,
'title': video_title, 'title': video_title,
'description': video_description, 'description': video_description,
'thumbnail': xpath_text(metadata, 'episode_image_url'), 'duration': float_or_none(media_metadata.get('duration'), 1000),
'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'),
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': video_upload_date, 'upload_date': video_upload_date,
'series': series, 'series': series,

View file

@ -72,7 +72,7 @@ class VRVBaseIE(InfoExtractor):
class VRVIE(VRVBaseIE): class VRVIE(VRVBaseIE):
IE_NAME = 'vrv' IE_NAME = 'vrv'
_VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
_TEST = { _TESTS = [{
'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
'info_dict': { 'info_dict': {
'id': 'GR9PNZ396', 'id': 'GR9PNZ396',
@ -85,7 +85,28 @@ class VRVIE(VRVBaseIE):
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
} }]
def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
if not url or stream_format not in ('hls', 'dash'):
return []
stream_id = hardsub_lang or audio_lang
format_id = '%s-%s' % (stream_format, stream_id)
if stream_format == 'hls':
adaptive_formats = self._extract_m3u8_formats(
url, video_id, 'mp4', m3u8_id=format_id,
note='Downloading %s m3u8 information' % stream_id,
fatal=False)
elif stream_format == 'dash':
adaptive_formats = self._extract_mpd_formats(
url, video_id, mpd_id=format_id,
note='Downloading %s MPD information' % stream_id,
fatal=False)
if audio_lang:
for f in adaptive_formats:
if f.get('acodec') != 'none':
f['language'] = audio_lang
return adaptive_formats
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -115,26 +136,9 @@ class VRVIE(VRVBaseIE):
for stream_type, streams in streams_json.get('streams', {}).items(): for stream_type, streams in streams_json.get('streams', {}).items():
if stream_type in ('adaptive_hls', 'adaptive_dash'): if stream_type in ('adaptive_hls', 'adaptive_dash'):
for stream in streams.values(): for stream in streams.values():
stream_url = stream.get('url') formats.extend(self._extract_vrv_formats(
if not stream_url: stream.get('url'), video_id, stream_type.split('_')[1],
continue audio_locale, stream.get('hardsub_locale')))
stream_id = stream.get('hardsub_locale') or audio_locale
format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
if stream_type == 'adaptive_hls':
adaptive_formats = self._extract_m3u8_formats(
stream_url, video_id, 'mp4', m3u8_id=format_id,
note='Downloading %s m3u8 information' % stream_id,
fatal=False)
else:
adaptive_formats = self._extract_mpd_formats(
stream_url, video_id, mpd_id=format_id,
note='Downloading %s MPD information' % stream_id,
fatal=False)
if audio_locale:
for f in adaptive_formats:
if f.get('acodec') != 'none':
f['language'] = audio_locale
formats.extend(adaptive_formats)
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {} subtitles = {}