[wistia] Add extractor

This commit is contained in:
Philipp Hagemeister 2013-12-06 09:15:04 +01:00
parent 72135030d1
commit ef4fd84857
4 changed files with 80 additions and 10 deletions

View File

@ -488,7 +488,8 @@ class YoutubeDL(object):
new_result = ie_result.copy() new_result = ie_result.copy()
for f in ('_type', 'url', 'ext', 'player_url', 'formats', for f in ('_type', 'url', 'ext', 'player_url', 'formats',
'entries', 'urlhandle', 'ie_key', 'duration', 'entries', 'urlhandle', 'ie_key', 'duration',
'subtitles', 'annotations', 'format'): 'subtitles', 'annotations', 'format',
'thumbnail', 'thumbnails'):
if f in new_result: if f in new_result:
del new_result[f] del new_result[f]
if f in embedded_info: if f in embedded_info:

View File

@ -178,6 +178,7 @@ from .wat import WatIE
from .websurg import WeBSurgIE from .websurg import WeBSurgIE
from .weibo import WeiboIE from .weibo import WeiboIE
from .wimp import WimpIE from .wimp import WimpIE
from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE from .xhamster import XHamsterIE
from .xnxx import XNXXIE from .xnxx import XNXXIE

View File

@ -169,8 +169,13 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title # Site Name | Video Title
# Video Title - Tagline | Site Name # Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical # and so on and so forth; it's just not practical
video_title = self._html_search_regex(r'<title>(.*)</title>', video_title = self._html_search_regex(
webpage, u'video title', default=u'video', flags=re.DOTALL) r'(?s)<title>(.*?)</title>', webpage, u'video title',
default=u'video')
# video uploader is domain name
video_uploader = self._search_regex(
r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
# Look for BrightCove: # Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage) bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@ -188,7 +193,7 @@ class GenericIE(InfoExtractor):
# Look for embedded YouTube player # Look for embedded YouTube player
matches = re.findall( matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage) r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
if matches: if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
for tuppl in matches] for tuppl in matches]
@ -197,13 +202,26 @@ class GenericIE(InfoExtractor):
# Look for embedded Dailymotion player # Look for embedded Dailymotion player
matches = re.findall( matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage) r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
if matches: if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
for tuppl in matches] for tuppl in matches]
return self.playlist_result( return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title) urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for embedded Wistia player
match = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
if match:
return {
'_type': 'url_transparent',
'url': unescapeHTML(match.group('url')),
'ie_key': 'Wistia',
'uploader': video_uploader,
'title': video_title,
'id': video_id,
}
# Look for Bandcamp pages with custom domain # Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None: if mobj is not None:
@ -247,14 +265,9 @@ class GenericIE(InfoExtractor):
# here's a fun little line of code for you: # here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0] video_id = os.path.splitext(video_id)[0]
# video uploader is domain name
video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
url, u'video uploader')
return { return {
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': None,
'title': video_title, 'title': video_title,
} }

View File

@ -0,0 +1,55 @@
import json
import re
from .common import InfoExtractor
class WistiaIE(InfoExtractor):
_VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
_TEST = {
u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
u"file": u"sh7fpupwlt.mov",
u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
u"info_dict": {
u"title": u"cfh_resourceful_zdkh_final_1"
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
data_json = self._html_search_regex(
r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data')
data = json.loads(data_json)
formats = []
thumbnails = []
for atype, a in data['assets'].items():
if atype == 'still':
thumbnails.append({
'url': a['url'],
'resolution': '%dx%d' % (a['width'], a['height']),
})
continue
if atype == 'preview':
continue
formats.append({
'format_id': atype,
'url': a['url'],
'width': a['width'],
'height': a['height'],
'filesize': a['size'],
'ext': a['ext'],
})
formats.sort(key=lambda a: a['filesize'])
return {
'id': video_id,
'title': data['name'],
'formats': formats,
'thumbnails': thumbnails,
}