From 99a4eac6e0a541a2ab54eb156c5516e088e54e9e Mon Sep 17 00:00:00 2001 From: Jens Timmerman Date: Sat, 24 Oct 2020 01:26:12 +0200 Subject: [PATCH] added latest youtube-dlc release --- AUTHORS | 248 + ChangeLog | 5294 +++++++++++++++ LICENSE | 24 + MANIFEST.in | 9 + PKG-INFO | 33 + README.md | 487 ++ README.txt | 529 ++ docs/Makefile | 177 + docs/conf.py | 71 + docs/index.rst | 23 + docs/module_guide.rst | 67 + setup.cfg | 11 + setup.py | 103 + test/__init__.py | 0 test/helper.py | 282 + test/parameters.json | 43 + test/swftests/.gitignore | 1 + test/swftests/ArrayAccess.as | 19 + test/swftests/ClassCall.as | 17 + test/swftests/ClassConstruction.as | 15 + test/swftests/ConstArrayAccess.as | 18 + test/swftests/ConstantInt.as | 12 + test/swftests/DictCall.as | 10 + test/swftests/EqualsOperator.as | 10 + test/swftests/LocalVars.as | 13 + test/swftests/MemberAssignment.as | 22 + test/swftests/NeOperator.as | 24 + test/swftests/PrivateCall.as | 21 + test/swftests/PrivateVoidCall.as | 22 + test/swftests/StaticAssignment.as | 13 + test/swftests/StaticRetrieval.as | 16 + test/swftests/StringBasics.as | 11 + test/swftests/StringCharCodeAt.as | 11 + test/swftests/StringConversion.as | 11 + test/test_InfoExtractor.py | 1071 +++ test/test_YoutubeDL.py | 924 +++ test/test_YoutubeDLCookieJar.py | 51 + test/test_aes.py | 63 + test/test_age_restriction.py | 50 + test/test_all_urls.py | 137 + test/test_cache.py | 59 + test/test_compat.py | 126 + test/test_download.py | 265 + test/test_downloader_http.py | 115 + test/test_execution.py | 44 + test/test_http.py | 166 + test/test_iqiyi_sdk_interpreter.py | 48 + test/test_jsinterp.py | 117 + test/test_netrc.py | 26 + test/test_options.py | 26 + test/test_postprocessors.py | 17 + test/test_socks.py | 118 + test/test_subtitles.py | 351 + test/test_swfinterp.py | 80 + test/test_unicode_literals.py | 65 + test/test_update.py | 30 + test/test_utils.py | 1442 +++++ test/test_verbose_output.py | 71 + test/test_write_annotations.py | 80 + 
test/test_youtube_chapters.py | 275 + test/test_youtube_lists.py | 71 + test/test_youtube_signature.py | 145 + test/testcert.pem | 52 + test/testdata/cookies/httponly_cookies.txt | 6 + test/testdata/cookies/malformed_cookies.txt | 9 + test/testdata/cookies/session_cookies.txt | 6 + test/testdata/f4m/custom_base_url.f4m | 10 + test/testdata/m3u8/pluzz_francetv_11507.m3u8 | 14 + test/testdata/m3u8/teamcoco_11995.m3u8 | 16 + test/testdata/m3u8/ted_18923.m3u8 | 28 + test/testdata/m3u8/toggle_mobile_12211.m3u8 | 13 + test/testdata/m3u8/twitch_vod.m3u8 | 20 + test/testdata/m3u8/vidio.m3u8 | 10 + test/testdata/mpd/float_duration.mpd | 18 + test/testdata/mpd/unfragmented.mpd | 28 + test/testdata/mpd/urls_only.mpd | 218 + test/testdata/xspf/foo_xspf.xspf | 34 + test/versions.json | 34 + youtube-dlc.1 | 947 +++ youtube-dlc.bash-completion | 29 + youtube-dlc.fish | 174 + youtube_dlc.egg-info/PKG-INFO | 33 + youtube_dlc.egg-info/SOURCES.txt | 890 +++ youtube_dlc.egg-info/dependency_links.txt | 1 + youtube_dlc.egg-info/entry_points.txt | 3 + youtube_dlc.egg-info/top_level.txt | 2 + youtube_dlc/YoutubeDL.py | 2489 +++++++ youtube_dlc/__init__.py | 496 ++ youtube_dlc/__main__.py | 19 + youtube_dlc/aes.py | 361 ++ youtube_dlc/cache.py | 96 + youtube_dlc/compat.py | 3050 +++++++++ youtube_dlc/downloader/__init__.py | 63 + youtube_dlc/downloader/common.py | 391 ++ youtube_dlc/downloader/dash.py | 80 + youtube_dlc/downloader/external.py | 371 ++ youtube_dlc/downloader/f4m.py | 438 ++ youtube_dlc/downloader/fragment.py | 269 + youtube_dlc/downloader/hls.py | 210 + youtube_dlc/downloader/http.py | 362 ++ youtube_dlc/downloader/ism.py | 259 + youtube_dlc/downloader/rtmp.py | 214 + youtube_dlc/downloader/rtsp.py | 47 + youtube_dlc/downloader/youtube_live_chat.py | 94 + youtube_dlc/extractor/__init__.py | 46 + youtube_dlc/extractor/abc.py | 257 + youtube_dlc/extractor/abcnews.py | 148 + youtube_dlc/extractor/abcotvs.py | 137 + youtube_dlc/extractor/academicearth.py | 41 + 
youtube_dlc/extractor/acast.py | 135 + youtube_dlc/extractor/adn.py | 207 + youtube_dlc/extractor/adobeconnect.py | 37 + youtube_dlc/extractor/adobepass.py | 1572 +++++ youtube_dlc/extractor/adobetv.py | 288 + youtube_dlc/extractor/adultswim.py | 202 + youtube_dlc/extractor/aenetworks.py | 247 + youtube_dlc/extractor/afreecatv.py | 367 ++ youtube_dlc/extractor/airmozilla.py | 66 + youtube_dlc/extractor/aliexpress.py | 53 + youtube_dlc/extractor/aljazeera.py | 33 + youtube_dlc/extractor/allocine.py | 132 + youtube_dlc/extractor/alphaporno.py | 77 + youtube_dlc/extractor/alura.py | 180 + youtube_dlc/extractor/amcnetworks.py | 118 + youtube_dlc/extractor/americastestkitchen.py | 82 + youtube_dlc/extractor/amp.py | 102 + youtube_dlc/extractor/animeondemand.py | 293 + youtube_dlc/extractor/anvato.py | 314 + youtube_dlc/extractor/aol.py | 133 + youtube_dlc/extractor/apa.py | 94 + youtube_dlc/extractor/aparat.py | 95 + youtube_dlc/extractor/appleconnect.py | 50 + youtube_dlc/extractor/appletrailers.py | 283 + youtube_dlc/extractor/archiveorg.py | 65 + youtube_dlc/extractor/ard.py | 574 ++ youtube_dlc/extractor/arkena.py | 133 + youtube_dlc/extractor/arte.py | 201 + youtube_dlc/extractor/asiancrush.py | 145 + youtube_dlc/extractor/atresplayer.py | 118 + youtube_dlc/extractor/atttechchannel.py | 55 + youtube_dlc/extractor/atvat.py | 75 + youtube_dlc/extractor/audimedia.py | 93 + youtube_dlc/extractor/audioboom.py | 73 + youtube_dlc/extractor/audiomack.py | 145 + youtube_dlc/extractor/awaan.py | 185 + youtube_dlc/extractor/aws.py | 78 + youtube_dlc/extractor/azmedien.py | 66 + youtube_dlc/extractor/baidu.py | 56 + youtube_dlc/extractor/bandcamp.py | 425 ++ youtube_dlc/extractor/bbc.py | 1359 ++++ youtube_dlc/extractor/beampro.py | 194 + youtube_dlc/extractor/beatport.py | 103 + youtube_dlc/extractor/beeg.py | 116 + youtube_dlc/extractor/behindkink.py | 46 + youtube_dlc/extractor/bellmedia.py | 88 + youtube_dlc/extractor/bet.py | 82 + youtube_dlc/extractor/bfi.py | 37 + 
youtube_dlc/extractor/bigflix.py | 78 + youtube_dlc/extractor/bild.py | 40 + youtube_dlc/extractor/bilibili.py | 450 ++ youtube_dlc/extractor/biobiochiletv.py | 86 + youtube_dlc/extractor/biqle.py | 105 + youtube_dlc/extractor/bitchute.py | 150 + youtube_dlc/extractor/bleacherreport.py | 106 + youtube_dlc/extractor/blinkx.py | 86 + youtube_dlc/extractor/bloomberg.py | 83 + youtube_dlc/extractor/bokecc.py | 60 + youtube_dlc/extractor/bostonglobe.py | 72 + youtube_dlc/extractor/bpb.py | 62 + youtube_dlc/extractor/br.py | 311 + youtube_dlc/extractor/bravotv.py | 84 + youtube_dlc/extractor/breakcom.py | 91 + youtube_dlc/extractor/brightcove.py | 677 ++ youtube_dlc/extractor/businessinsider.py | 48 + youtube_dlc/extractor/buzzfeed.py | 98 + youtube_dlc/extractor/byutv.py | 117 + youtube_dlc/extractor/c56.py | 65 + youtube_dlc/extractor/camdemy.py | 161 + youtube_dlc/extractor/cammodels.py | 98 + youtube_dlc/extractor/camtube.py | 71 + youtube_dlc/extractor/camwithher.py | 89 + youtube_dlc/extractor/canalc2.py | 73 + youtube_dlc/extractor/canalplus.py | 116 + youtube_dlc/extractor/canvas.py | 368 ++ youtube_dlc/extractor/carambatv.py | 108 + youtube_dlc/extractor/cartoonnetwork.py | 62 + youtube_dlc/extractor/cbc.py | 497 ++ youtube_dlc/extractor/cbs.py | 112 + youtube_dlc/extractor/cbsinteractive.py | 103 + youtube_dlc/extractor/cbslocal.py | 104 + youtube_dlc/extractor/cbsnews.py | 147 + youtube_dlc/extractor/cbssports.py | 38 + youtube_dlc/extractor/ccc.py | 111 + youtube_dlc/extractor/ccma.py | 109 + youtube_dlc/extractor/cctv.py | 191 + youtube_dlc/extractor/cda.py | 182 + youtube_dlc/extractor/ceskatelevize.py | 289 + youtube_dlc/extractor/channel9.py | 262 + youtube_dlc/extractor/charlierose.py | 54 + youtube_dlc/extractor/chaturbate.py | 109 + youtube_dlc/extractor/chilloutzone.py | 96 + youtube_dlc/extractor/chirbit.py | 91 + youtube_dlc/extractor/cinchcast.py | 58 + youtube_dlc/extractor/cinemax.py | 29 + youtube_dlc/extractor/ciscolive.py | 151 + 
youtube_dlc/extractor/cjsw.py | 72 + youtube_dlc/extractor/cliphunter.py | 79 + youtube_dlc/extractor/clippit.py | 74 + youtube_dlc/extractor/cliprs.py | 33 + youtube_dlc/extractor/clipsyndicate.py | 54 + youtube_dlc/extractor/closertotruth.py | 92 + youtube_dlc/extractor/cloudflarestream.py | 72 + youtube_dlc/extractor/cloudy.py | 60 + youtube_dlc/extractor/clubic.py | 56 + youtube_dlc/extractor/clyp.py | 82 + youtube_dlc/extractor/cmt.py | 56 + youtube_dlc/extractor/cnbc.py | 66 + youtube_dlc/extractor/cnn.py | 144 + youtube_dlc/extractor/comedycentral.py | 142 + youtube_dlc/extractor/common.py | 3023 +++++++++ youtube_dlc/extractor/commonmistakes.py | 50 + youtube_dlc/extractor/commonprotocols.py | 60 + youtube_dlc/extractor/condenast.py | 232 + youtube_dlc/extractor/contv.py | 118 + youtube_dlc/extractor/corus.py | 160 + youtube_dlc/extractor/coub.py | 140 + youtube_dlc/extractor/cracked.py | 90 + youtube_dlc/extractor/crackle.py | 200 + youtube_dlc/extractor/crooksandliars.py | 60 + youtube_dlc/extractor/crunchyroll.py | 686 ++ youtube_dlc/extractor/cspan.py | 196 + youtube_dlc/extractor/ctsnews.py | 87 + youtube_dlc/extractor/ctvnews.py | 68 + youtube_dlc/extractor/cultureunplugged.py | 70 + youtube_dlc/extractor/curiositystream.py | 161 + youtube_dlc/extractor/cwtv.py | 97 + youtube_dlc/extractor/dailymail.py | 84 + youtube_dlc/extractor/dailymotion.py | 393 ++ youtube_dlc/extractor/daum.py | 266 + youtube_dlc/extractor/dbtv.py | 57 + youtube_dlc/extractor/dctp.py | 105 + youtube_dlc/extractor/deezer.py | 147 + youtube_dlc/extractor/defense.py | 39 + youtube_dlc/extractor/democracynow.py | 96 + youtube_dlc/extractor/dfb.py | 57 + youtube_dlc/extractor/dhm.py | 59 + youtube_dlc/extractor/digg.py | 56 + youtube_dlc/extractor/digiteka.py | 112 + youtube_dlc/extractor/discovery.py | 118 + youtube_dlc/extractor/discoverygo.py | 175 + youtube_dlc/extractor/discoverynetworks.py | 40 + youtube_dlc/extractor/discoveryvr.py | 59 + youtube_dlc/extractor/disney.py | 170 
+ youtube_dlc/extractor/dispeak.py | 126 + youtube_dlc/extractor/dlive.py | 97 + youtube_dlc/extractor/doodstream.py | 71 + youtube_dlc/extractor/dotsub.py | 83 + youtube_dlc/extractor/douyutv.py | 201 + youtube_dlc/extractor/dplay.py | 247 + youtube_dlc/extractor/drbonanza.py | 59 + youtube_dlc/extractor/dropbox.py | 40 + youtube_dlc/extractor/drtuber.py | 112 + youtube_dlc/extractor/drtv.py | 352 + youtube_dlc/extractor/dtube.py | 83 + youtube_dlc/extractor/duboku.py | 242 + youtube_dlc/extractor/dumpert.py | 80 + youtube_dlc/extractor/dvtv.py | 184 + youtube_dlc/extractor/dw.py | 108 + youtube_dlc/extractor/eagleplatform.py | 206 + youtube_dlc/extractor/ebaumsworld.py | 33 + youtube_dlc/extractor/echomsk.py | 46 + youtube_dlc/extractor/egghead.py | 129 + youtube_dlc/extractor/ehow.py | 38 + youtube_dlc/extractor/eighttracks.py | 164 + youtube_dlc/extractor/einthusan.py | 111 + youtube_dlc/extractor/eitb.py | 88 + youtube_dlc/extractor/ellentube.py | 133 + youtube_dlc/extractor/elonet.py | 137 + youtube_dlc/extractor/elpais.py | 95 + youtube_dlc/extractor/embedly.py | 16 + youtube_dlc/extractor/engadget.py | 27 + youtube_dlc/extractor/eporner.py | 129 + youtube_dlc/extractor/eroprofile.py | 95 + youtube_dlc/extractor/escapist.py | 111 + youtube_dlc/extractor/espn.py | 238 + youtube_dlc/extractor/esri.py | 74 + youtube_dlc/extractor/europa.py | 93 + youtube_dlc/extractor/everyonesmixtape.py | 77 + youtube_dlc/extractor/expotv.py | 77 + youtube_dlc/extractor/expressen.py | 101 + youtube_dlc/extractor/extractors.py | 1541 +++++ youtube_dlc/extractor/extremetube.py | 50 + youtube_dlc/extractor/eyedotv.py | 64 + youtube_dlc/extractor/facebook.py | 514 ++ youtube_dlc/extractor/faz.py | 93 + youtube_dlc/extractor/fc2.py | 160 + youtube_dlc/extractor/fczenit.py | 56 + youtube_dlc/extractor/filmon.py | 178 + youtube_dlc/extractor/filmweb.py | 42 + youtube_dlc/extractor/firsttv.py | 156 + youtube_dlc/extractor/fivemin.py | 54 + youtube_dlc/extractor/fivetv.py | 91 + 
youtube_dlc/extractor/flickr.py | 116 + youtube_dlc/extractor/folketinget.py | 77 + youtube_dlc/extractor/footyroom.py | 56 + youtube_dlc/extractor/formula1.py | 33 + youtube_dlc/extractor/fourtube.py | 309 + youtube_dlc/extractor/fox.py | 150 + youtube_dlc/extractor/fox9.py | 41 + youtube_dlc/extractor/foxgay.py | 63 + youtube_dlc/extractor/foxnews.py | 127 + youtube_dlc/extractor/foxsports.py | 33 + youtube_dlc/extractor/franceculture.py | 69 + youtube_dlc/extractor/franceinter.py | 56 + youtube_dlc/extractor/francetv.py | 518 ++ youtube_dlc/extractor/freesound.py | 79 + youtube_dlc/extractor/freespeech.py | 31 + youtube_dlc/extractor/freshlive.py | 83 + youtube_dlc/extractor/frontendmasters.py | 263 + youtube_dlc/extractor/funimation.py | 154 + youtube_dlc/extractor/funk.py | 49 + youtube_dlc/extractor/fusion.py | 84 + youtube_dlc/extractor/fxnetworks.py | 77 + youtube_dlc/extractor/gaia.py | 130 + youtube_dlc/extractor/gameinformer.py | 49 + youtube_dlc/extractor/gamespot.py | 139 + youtube_dlc/extractor/gamestar.py | 65 + youtube_dlc/extractor/gaskrank.py | 101 + youtube_dlc/extractor/gazeta.py | 48 + youtube_dlc/extractor/gdcvault.py | 157 + youtube_dlc/extractor/generic.py | 3459 ++++++++++ youtube_dlc/extractor/gfycat.py | 125 + youtube_dlc/extractor/giantbomb.py | 90 + youtube_dlc/extractor/giga.py | 102 + youtube_dlc/extractor/gigya.py | 22 + youtube_dlc/extractor/glide.py | 43 + youtube_dlc/extractor/globo.py | 240 + youtube_dlc/extractor/go.py | 272 + youtube_dlc/extractor/godtube.py | 58 + youtube_dlc/extractor/golem.py | 72 + youtube_dlc/extractor/googledrive.py | 290 + youtube_dlc/extractor/googleplus.py | 73 + youtube_dlc/extractor/googlesearch.py | 59 + youtube_dlc/extractor/goshgay.py | 51 + youtube_dlc/extractor/gputechconf.py | 35 + youtube_dlc/extractor/groupon.py | 67 + youtube_dlc/extractor/hbo.py | 175 + youtube_dlc/extractor/hearthisat.py | 135 + youtube_dlc/extractor/heise.py | 172 + youtube_dlc/extractor/hellporno.py | 76 + 
youtube_dlc/extractor/helsinki.py | 43 + youtube_dlc/extractor/hentaistigma.py | 39 + youtube_dlc/extractor/hgtv.py | 40 + youtube_dlc/extractor/hidive.py | 118 + youtube_dlc/extractor/historicfilms.py | 47 + youtube_dlc/extractor/hitbox.py | 214 + youtube_dlc/extractor/hitrecord.py | 68 + youtube_dlc/extractor/hketv.py | 191 + youtube_dlc/extractor/hornbunny.py | 49 + youtube_dlc/extractor/hotnewhiphop.py | 66 + youtube_dlc/extractor/hotstar.py | 237 + youtube_dlc/extractor/howcast.py | 43 + youtube_dlc/extractor/howstuffworks.py | 90 + youtube_dlc/extractor/hrfensehen.py | 102 + youtube_dlc/extractor/hrti.py | 208 + youtube_dlc/extractor/huajiao.py | 56 + youtube_dlc/extractor/huffpost.py | 96 + youtube_dlc/extractor/hungama.py | 117 + youtube_dlc/extractor/hypem.py | 49 + youtube_dlc/extractor/ign.py | 232 + youtube_dlc/extractor/imdb.py | 147 + youtube_dlc/extractor/imggaming.py | 133 + youtube_dlc/extractor/imgur.py | 154 + youtube_dlc/extractor/ina.py | 83 + youtube_dlc/extractor/inc.py | 59 + youtube_dlc/extractor/indavideo.py | 128 + youtube_dlc/extractor/infoq.py | 136 + youtube_dlc/extractor/instagram.py | 428 ++ youtube_dlc/extractor/internazionale.py | 85 + youtube_dlc/extractor/internetvideoarchive.py | 64 + youtube_dlc/extractor/iprima.py | 149 + youtube_dlc/extractor/iqiyi.py | 394 ++ youtube_dlc/extractor/ir90tv.py | 42 + youtube_dlc/extractor/itv.py | 312 + youtube_dlc/extractor/ivi.py | 271 + youtube_dlc/extractor/ivideon.py | 83 + youtube_dlc/extractor/iwara.py | 99 + youtube_dlc/extractor/izlesene.py | 117 + youtube_dlc/extractor/jamendo.py | 187 + youtube_dlc/extractor/jeuxvideo.py | 56 + youtube_dlc/extractor/joj.py | 108 + youtube_dlc/extractor/jove.py | 80 + youtube_dlc/extractor/jwplatform.py | 46 + youtube_dlc/extractor/kakao.py | 136 + youtube_dlc/extractor/kaltura.py | 377 ++ youtube_dlc/extractor/kanalplay.py | 97 + youtube_dlc/extractor/kankan.py | 48 + youtube_dlc/extractor/karaoketv.py | 64 + youtube_dlc/extractor/karrierevideos.py | 
99 + youtube_dlc/extractor/keezmovies.py | 133 + youtube_dlc/extractor/ketnet.py | 93 + youtube_dlc/extractor/khanacademy.py | 82 + youtube_dlc/extractor/kickstarter.py | 71 + youtube_dlc/extractor/kinja.py | 221 + youtube_dlc/extractor/kinopoisk.py | 70 + youtube_dlc/extractor/konserthusetplay.py | 124 + youtube_dlc/extractor/krasview.py | 60 + youtube_dlc/extractor/ku6.py | 32 + youtube_dlc/extractor/kusi.py | 88 + youtube_dlc/extractor/kuwo.py | 352 + youtube_dlc/extractor/la7.py | 67 + youtube_dlc/extractor/laola1tv.py | 265 + youtube_dlc/extractor/lci.py | 26 + youtube_dlc/extractor/lcp.py | 90 + youtube_dlc/extractor/lecture2go.py | 71 + youtube_dlc/extractor/lecturio.py | 243 + youtube_dlc/extractor/leeco.py | 368 ++ youtube_dlc/extractor/lego.py | 149 + youtube_dlc/extractor/lemonde.py | 58 + youtube_dlc/extractor/lenta.py | 53 + youtube_dlc/extractor/libraryofcongress.py | 153 + youtube_dlc/extractor/libsyn.py | 93 + youtube_dlc/extractor/lifenews.py | 239 + youtube_dlc/extractor/limelight.py | 358 ++ youtube_dlc/extractor/line.py | 90 + youtube_dlc/extractor/linkedin.py | 182 + youtube_dlc/extractor/linuxacademy.py | 173 + youtube_dlc/extractor/litv.py | 148 + youtube_dlc/extractor/livejournal.py | 42 + youtube_dlc/extractor/liveleak.py | 191 + youtube_dlc/extractor/livestream.py | 366 ++ youtube_dlc/extractor/lnkgo.py | 88 + youtube_dlc/extractor/localnews8.py | 47 + youtube_dlc/extractor/lovehomeporn.py | 37 + youtube_dlc/extractor/lrt.py | 94 + youtube_dlc/extractor/lynda.py | 341 + youtube_dlc/extractor/m6.py | 25 + youtube_dlc/extractor/magentamusik360.py | 61 + youtube_dlc/extractor/mailru.py | 335 + youtube_dlc/extractor/malltv.py | 56 + youtube_dlc/extractor/mangomolo.py | 58 + youtube_dlc/extractor/manyvids.py | 92 + youtube_dlc/extractor/markiza.py | 125 + youtube_dlc/extractor/massengeschmacktv.py | 77 + youtube_dlc/extractor/matchtv.py | 55 + youtube_dlc/extractor/mdr.py | 184 + youtube_dlc/extractor/medialaan.py | 269 + 
youtube_dlc/extractor/mediaset.py | 179 + youtube_dlc/extractor/mediasite.py | 366 ++ youtube_dlc/extractor/medici.py | 70 + youtube_dlc/extractor/megaphone.py | 55 + youtube_dlc/extractor/meipai.py | 104 + youtube_dlc/extractor/melonvod.py | 72 + youtube_dlc/extractor/meta.py | 73 + youtube_dlc/extractor/metacafe.py | 287 + youtube_dlc/extractor/metacritic.py | 65 + youtube_dlc/extractor/mgoon.py | 87 + youtube_dlc/extractor/mgtv.py | 96 + youtube_dlc/extractor/miaopai.py | 40 + .../extractor/microsoftvirtualacademy.py | 195 + youtube_dlc/extractor/ministrygrid.py | 57 + youtube_dlc/extractor/minoto.py | 51 + youtube_dlc/extractor/miomio.py | 141 + youtube_dlc/extractor/mit.py | 132 + youtube_dlc/extractor/mitele.py | 120 + youtube_dlc/extractor/mixcloud.py | 351 + youtube_dlc/extractor/mlb.py | 120 + youtube_dlc/extractor/mnet.py | 89 + youtube_dlc/extractor/moevideo.py | 79 + youtube_dlc/extractor/mofosex.py | 79 + youtube_dlc/extractor/mojvideo.py | 58 + youtube_dlc/extractor/morningstar.py | 50 + youtube_dlc/extractor/motherless.py | 207 + youtube_dlc/extractor/motorsport.py | 49 + youtube_dlc/extractor/movieclips.py | 49 + youtube_dlc/extractor/moviezine.py | 45 + youtube_dlc/extractor/movingimage.py | 52 + youtube_dlc/extractor/msn.py | 171 + youtube_dlc/extractor/mtv.py | 514 ++ youtube_dlc/extractor/muenchentv.py | 75 + youtube_dlc/extractor/mwave.py | 90 + youtube_dlc/extractor/mychannels.py | 40 + youtube_dlc/extractor/myspace.py | 212 + youtube_dlc/extractor/myspass.py | 56 + youtube_dlc/extractor/myvi.py | 111 + youtube_dlc/extractor/myvideoge.py | 56 + youtube_dlc/extractor/myvidster.py | 29 + youtube_dlc/extractor/nationalgeographic.py | 82 + youtube_dlc/extractor/naver.py | 251 + youtube_dlc/extractor/nba.py | 154 + youtube_dlc/extractor/nbc.py | 541 ++ youtube_dlc/extractor/ndr.py | 405 ++ youtube_dlc/extractor/ndtv.py | 115 + youtube_dlc/extractor/nerdcubed.py | 36 + youtube_dlc/extractor/neteasemusic.py | 485 ++ youtube_dlc/extractor/netzkino.py 
| 89 + youtube_dlc/extractor/newgrounds.py | 168 + youtube_dlc/extractor/newstube.py | 83 + youtube_dlc/extractor/nextmedia.py | 238 + youtube_dlc/extractor/nexx.py | 453 ++ youtube_dlc/extractor/nfl.py | 231 + youtube_dlc/extractor/nhk.py | 93 + youtube_dlc/extractor/nhl.py | 128 + youtube_dlc/extractor/nick.py | 249 + youtube_dlc/extractor/niconico.py | 470 ++ youtube_dlc/extractor/ninecninemedia.py | 102 + youtube_dlc/extractor/ninegag.py | 104 + youtube_dlc/extractor/ninenow.py | 93 + youtube_dlc/extractor/nintendo.py | 60 + youtube_dlc/extractor/njpwworld.py | 98 + youtube_dlc/extractor/nobelprize.py | 62 + youtube_dlc/extractor/noco.py | 235 + youtube_dlc/extractor/nonktube.py | 38 + youtube_dlc/extractor/noovo.py | 104 + youtube_dlc/extractor/normalboots.py | 54 + youtube_dlc/extractor/nosvideo.py | 75 + youtube_dlc/extractor/nova.py | 305 + youtube_dlc/extractor/nowness.py | 147 + youtube_dlc/extractor/noz.py | 89 + youtube_dlc/extractor/npo.py | 767 +++ youtube_dlc/extractor/npr.py | 124 + youtube_dlc/extractor/nrk.py | 723 +++ youtube_dlc/extractor/nrl.py | 30 + youtube_dlc/extractor/ntvcojp.py | 49 + youtube_dlc/extractor/ntvde.py | 77 + youtube_dlc/extractor/ntvru.py | 131 + youtube_dlc/extractor/nuevo.py | 39 + youtube_dlc/extractor/nuvid.py | 71 + youtube_dlc/extractor/nytimes.py | 223 + youtube_dlc/extractor/nzz.py | 43 + youtube_dlc/extractor/odatv.py | 50 + youtube_dlc/extractor/odnoklassniki.py | 268 + youtube_dlc/extractor/oktoberfesttv.py | 47 + youtube_dlc/extractor/once.py | 43 + youtube_dlc/extractor/ondemandkorea.py | 86 + youtube_dlc/extractor/onet.py | 268 + youtube_dlc/extractor/onionstudios.py | 53 + youtube_dlc/extractor/ooyala.py | 210 + youtube_dlc/extractor/openload.py | 238 + youtube_dlc/extractor/ora.py | 75 + youtube_dlc/extractor/orf.py | 570 ++ youtube_dlc/extractor/outsidetv.py | 28 + youtube_dlc/extractor/packtpub.py | 164 + youtube_dlc/extractor/pandoratv.py | 134 + youtube_dlc/extractor/parliamentliveuk.py | 43 + 
youtube_dlc/extractor/patreon.py | 156 + youtube_dlc/extractor/pbs.py | 710 ++ youtube_dlc/extractor/pearvideo.py | 63 + youtube_dlc/extractor/peertube.py | 600 ++ youtube_dlc/extractor/people.py | 32 + youtube_dlc/extractor/performgroup.py | 83 + youtube_dlc/extractor/periscope.py | 189 + youtube_dlc/extractor/philharmoniedeparis.py | 106 + youtube_dlc/extractor/phoenix.py | 52 + youtube_dlc/extractor/photobucket.py | 46 + youtube_dlc/extractor/picarto.py | 153 + youtube_dlc/extractor/piksel.py | 138 + youtube_dlc/extractor/pinkbike.py | 97 + youtube_dlc/extractor/pladform.py | 125 + youtube_dlc/extractor/platzi.py | 224 + youtube_dlc/extractor/playfm.py | 75 + youtube_dlc/extractor/playplustv.py | 109 + youtube_dlc/extractor/plays.py | 53 + youtube_dlc/extractor/playtvak.py | 191 + youtube_dlc/extractor/playvid.py | 99 + youtube_dlc/extractor/playwire.py | 75 + youtube_dlc/extractor/pluralsight.py | 501 ++ youtube_dlc/extractor/podomatic.py | 76 + youtube_dlc/extractor/pokemon.py | 138 + youtube_dlc/extractor/polskieradio.py | 180 + youtube_dlc/extractor/popcorntimes.py | 99 + youtube_dlc/extractor/popcorntv.py | 76 + youtube_dlc/extractor/porn91.py | 63 + youtube_dlc/extractor/porncom.py | 103 + youtube_dlc/extractor/pornhd.py | 121 + youtube_dlc/extractor/pornhub.py | 618 ++ youtube_dlc/extractor/pornotube.py | 85 + youtube_dlc/extractor/pornovoisines.py | 108 + youtube_dlc/extractor/pornoxo.py | 58 + youtube_dlc/extractor/presstv.py | 74 + youtube_dlc/extractor/prosiebensat1.py | 500 ++ youtube_dlc/extractor/puhutv.py | 239 + youtube_dlc/extractor/puls4.py | 57 + youtube_dlc/extractor/pyvideo.py | 72 + youtube_dlc/extractor/qqmusic.py | 369 ++ youtube_dlc/extractor/r7.py | 112 + youtube_dlc/extractor/radiobremen.py | 63 + youtube_dlc/extractor/radiocanada.py | 171 + youtube_dlc/extractor/radiode.py | 52 + youtube_dlc/extractor/radiofrance.py | 59 + youtube_dlc/extractor/radiojavan.py | 83 + youtube_dlc/extractor/rai.py | 473 ++ 
youtube_dlc/extractor/raywenderlich.py | 179 + youtube_dlc/extractor/rbmaradio.py | 72 + youtube_dlc/extractor/rds.py | 70 + youtube_dlc/extractor/redbulltv.py | 229 + youtube_dlc/extractor/reddit.py | 130 + youtube_dlc/extractor/redtube.py | 136 + youtube_dlc/extractor/regiotv.py | 62 + youtube_dlc/extractor/rentv.py | 106 + youtube_dlc/extractor/restudy.py | 44 + youtube_dlc/extractor/reuters.py | 69 + youtube_dlc/extractor/reverbnation.py | 53 + youtube_dlc/extractor/rice.py | 116 + youtube_dlc/extractor/rmcdecouverte.py | 55 + youtube_dlc/extractor/ro220.py | 43 + youtube_dlc/extractor/rockstargames.py | 69 + youtube_dlc/extractor/roosterteeth.py | 137 + youtube_dlc/extractor/rottentomatoes.py | 32 + youtube_dlc/extractor/roxwel.py | 53 + youtube_dlc/extractor/rozhlas.py | 50 + youtube_dlc/extractor/rtbf.py | 161 + youtube_dlc/extractor/rte.py | 167 + youtube_dlc/extractor/rtl2.py | 207 + youtube_dlc/extractor/rtlnl.py | 146 + youtube_dlc/extractor/rtp.py | 66 + youtube_dlc/extractor/rts.py | 230 + youtube_dlc/extractor/rtve.py | 292 + youtube_dlc/extractor/rtvnh.py | 62 + youtube_dlc/extractor/rtvs.py | 47 + youtube_dlc/extractor/ruhd.py | 45 + youtube_dlc/extractor/rutube.py | 313 + youtube_dlc/extractor/rutv.py | 211 + youtube_dlc/extractor/ruutu.py | 153 + youtube_dlc/extractor/ruv.py | 101 + youtube_dlc/extractor/safari.py | 264 + youtube_dlc/extractor/sapo.py | 119 + youtube_dlc/extractor/savefrom.py | 34 + youtube_dlc/extractor/sbs.py | 66 + youtube_dlc/extractor/screencast.py | 123 + youtube_dlc/extractor/screencastomatic.py | 37 + youtube_dlc/extractor/scrippsnetworks.py | 152 + youtube_dlc/extractor/scte.py | 144 + youtube_dlc/extractor/seeker.py | 58 + youtube_dlc/extractor/senateisvp.py | 153 + youtube_dlc/extractor/sendtonews.py | 105 + youtube_dlc/extractor/servus.py | 69 + youtube_dlc/extractor/sevenplus.py | 84 + youtube_dlc/extractor/sexu.py | 63 + youtube_dlc/extractor/seznamzpravy.py | 169 + youtube_dlc/extractor/shahid.py | 215 + 
youtube_dlc/extractor/shared.py | 138 + youtube_dlc/extractor/showroomlive.py | 84 + youtube_dlc/extractor/sina.py | 115 + youtube_dlc/extractor/sixplay.py | 129 + youtube_dlc/extractor/sky.py | 70 + youtube_dlc/extractor/skylinewebcams.py | 42 + youtube_dlc/extractor/skynewsarabia.py | 117 + youtube_dlc/extractor/slideshare.py | 56 + youtube_dlc/extractor/slideslive.py | 61 + youtube_dlc/extractor/slutload.py | 65 + youtube_dlc/extractor/smotri.py | 416 ++ youtube_dlc/extractor/snotr.py | 73 + youtube_dlc/extractor/sohu.py | 202 + youtube_dlc/extractor/sonyliv.py | 40 + youtube_dlc/extractor/soundcloud.py | 906 +++ youtube_dlc/extractor/soundgasm.py | 77 + youtube_dlc/extractor/southpark.py | 115 + youtube_dlc/extractor/spankbang.py | 184 + youtube_dlc/extractor/spankwire.py | 182 + youtube_dlc/extractor/spiegel.py | 159 + youtube_dlc/extractor/spiegeltv.py | 17 + youtube_dlc/extractor/spike.py | 65 + youtube_dlc/extractor/sport5.py | 92 + youtube_dlc/extractor/sportbox.py | 99 + youtube_dlc/extractor/sportdeutschland.py | 82 + youtube_dlc/extractor/springboardplatform.py | 125 + youtube_dlc/extractor/sprout.py | 52 + youtube_dlc/extractor/srgssr.py | 192 + youtube_dlc/extractor/srmediathek.py | 59 + youtube_dlc/extractor/stanfordoc.py | 91 + youtube_dlc/extractor/steam.py | 149 + youtube_dlc/extractor/stitcher.py | 81 + youtube_dlc/extractor/storyfire.py | 255 + youtube_dlc/extractor/streamable.py | 112 + youtube_dlc/extractor/streamcloud.py | 78 + youtube_dlc/extractor/streamcz.py | 105 + youtube_dlc/extractor/streetvoice.py | 49 + youtube_dlc/extractor/stretchinternet.py | 32 + youtube_dlc/extractor/stv.py | 67 + youtube_dlc/extractor/sunporno.py | 79 + youtube_dlc/extractor/sverigesradio.py | 115 + youtube_dlc/extractor/svt.py | 388 ++ youtube_dlc/extractor/swrmediathek.py | 115 + youtube_dlc/extractor/syfy.py | 58 + youtube_dlc/extractor/sztvhu.py | 41 + youtube_dlc/extractor/tagesschau.py | 311 + youtube_dlc/extractor/tass.py | 62 + 
youtube_dlc/extractor/tastytrade.py | 43 + youtube_dlc/extractor/tbs.py | 89 + youtube_dlc/extractor/tdslifeway.py | 33 + youtube_dlc/extractor/teachable.py | 298 + youtube_dlc/extractor/teachertube.py | 129 + youtube_dlc/extractor/teachingchannel.py | 33 + youtube_dlc/extractor/teamcoco.py | 205 + youtube_dlc/extractor/teamtreehouse.py | 140 + youtube_dlc/extractor/techtalks.py | 82 + youtube_dlc/extractor/ted.py | 363 ++ youtube_dlc/extractor/tele13.py | 88 + youtube_dlc/extractor/tele5.py | 108 + youtube_dlc/extractor/telebruxelles.py | 76 + youtube_dlc/extractor/telecinco.py | 188 + youtube_dlc/extractor/telegraaf.py | 89 + youtube_dlc/extractor/telemb.py | 78 + youtube_dlc/extractor/telequebec.py | 238 + youtube_dlc/extractor/teletask.py | 53 + youtube_dlc/extractor/telewebion.py | 55 + youtube_dlc/extractor/tennistv.py | 112 + youtube_dlc/extractor/tenplay.py | 58 + youtube_dlc/extractor/testurl.py | 64 + youtube_dlc/extractor/tf1.py | 92 + youtube_dlc/extractor/tfo.py | 55 + youtube_dlc/extractor/theintercept.py | 49 + youtube_dlc/extractor/theplatform.py | 411 ++ youtube_dlc/extractor/thescene.py | 44 + youtube_dlc/extractor/thestar.py | 36 + youtube_dlc/extractor/thesun.py | 38 + youtube_dlc/extractor/theweatherchannel.py | 79 + youtube_dlc/extractor/thisamericanlife.py | 40 + youtube_dlc/extractor/thisav.py | 73 + youtube_dlc/extractor/thisoldhouse.py | 47 + youtube_dlc/extractor/threeqsdn.py | 142 + youtube_dlc/extractor/tiktok.py | 139 + youtube_dlc/extractor/tinypic.py | 56 + youtube_dlc/extractor/tmz.py | 56 + youtube_dlc/extractor/tnaflix.py | 327 + youtube_dlc/extractor/toggle.py | 213 + youtube_dlc/extractor/tonline.py | 59 + youtube_dlc/extractor/toongoggles.py | 81 + youtube_dlc/extractor/toutv.py | 93 + youtube_dlc/extractor/toypics.py | 90 + youtube_dlc/extractor/traileraddict.py | 64 + youtube_dlc/extractor/trilulilu.py | 103 + youtube_dlc/extractor/trunews.py | 34 + youtube_dlc/extractor/trutv.py | 75 + youtube_dlc/extractor/tube8.py | 86 + 
youtube_dlc/extractor/tubitv.py | 96 + youtube_dlc/extractor/tudou.py | 49 + youtube_dlc/extractor/tumblr.py | 213 + youtube_dlc/extractor/tunein.py | 183 + youtube_dlc/extractor/tunepk.py | 90 + youtube_dlc/extractor/turbo.py | 68 + youtube_dlc/extractor/turner.py | 234 + youtube_dlc/extractor/tv2.py | 192 + youtube_dlc/extractor/tv2dk.py | 154 + youtube_dlc/extractor/tv2hu.py | 62 + youtube_dlc/extractor/tv4.py | 124 + youtube_dlc/extractor/tv5mondeplus.py | 117 + youtube_dlc/extractor/tva.py | 57 + youtube_dlc/extractor/tvanouvelles.py | 65 + youtube_dlc/extractor/tvc.py | 109 + youtube_dlc/extractor/tvigle.py | 138 + youtube_dlc/extractor/tvland.py | 37 + youtube_dlc/extractor/tvn24.py | 103 + youtube_dlc/extractor/tvnet.py | 147 + youtube_dlc/extractor/tvnoe.py | 48 + youtube_dlc/extractor/tvnow.py | 644 ++ youtube_dlc/extractor/tvp.py | 252 + youtube_dlc/extractor/tvplay.py | 512 ++ youtube_dlc/extractor/tvplayer.py | 86 + youtube_dlc/extractor/tweakers.py | 62 + youtube_dlc/extractor/twentyfourvideo.py | 133 + youtube_dlc/extractor/twentymin.py | 91 + youtube_dlc/extractor/twentythreevideo.py | 77 + youtube_dlc/extractor/twitcasting.py | 81 + youtube_dlc/extractor/twitch.py | 992 +++ youtube_dlc/extractor/twitter.py | 610 ++ youtube_dlc/extractor/udemy.py | 481 ++ youtube_dlc/extractor/udn.py | 102 + youtube_dlc/extractor/ufctv.py | 16 + youtube_dlc/extractor/uktvplay.py | 33 + youtube_dlc/extractor/umg.py | 103 + youtube_dlc/extractor/unistra.py | 67 + youtube_dlc/extractor/unity.py | 32 + youtube_dlc/extractor/uol.py | 144 + youtube_dlc/extractor/uplynk.py | 70 + youtube_dlc/extractor/urort.py | 66 + youtube_dlc/extractor/urplay.py | 71 + youtube_dlc/extractor/usanetwork.py | 74 + youtube_dlc/extractor/usatoday.py | 63 + youtube_dlc/extractor/ustream.py | 281 + youtube_dlc/extractor/ustudio.py | 125 + youtube_dlc/extractor/varzesh3.py | 79 + youtube_dlc/extractor/vbox7.py | 105 + youtube_dlc/extractor/veehd.py | 118 + youtube_dlc/extractor/veoh.py | 103 + 
youtube_dlc/extractor/vesti.py | 121 + youtube_dlc/extractor/vevo.py | 374 ++ youtube_dlc/extractor/vgtv.py | 307 + youtube_dlc/extractor/vh1.py | 43 + youtube_dlc/extractor/vice.py | 337 + youtube_dlc/extractor/vidbit.py | 84 + youtube_dlc/extractor/viddler.py | 138 + youtube_dlc/extractor/videa.py | 164 + youtube_dlc/extractor/videodetective.py | 29 + youtube_dlc/extractor/videofyme.py | 52 + youtube_dlc/extractor/videomore.py | 307 + youtube_dlc/extractor/videopress.py | 96 + youtube_dlc/extractor/vidio.py | 77 + youtube_dlc/extractor/vidlii.py | 125 + youtube_dlc/extractor/vidme.py | 295 + youtube_dlc/extractor/vidzi.py | 68 + youtube_dlc/extractor/vier.py | 264 + youtube_dlc/extractor/viewlift.py | 250 + youtube_dlc/extractor/viidea.py | 202 + youtube_dlc/extractor/viki.py | 408 ++ youtube_dlc/extractor/vimeo.py | 1128 ++++ youtube_dlc/extractor/vimple.py | 61 + youtube_dlc/extractor/vine.py | 154 + youtube_dlc/extractor/viqeo.py | 99 + youtube_dlc/extractor/viu.py | 272 + youtube_dlc/extractor/vk.py | 678 ++ youtube_dlc/extractor/vlive.py | 367 ++ youtube_dlc/extractor/vodlocker.py | 80 + youtube_dlc/extractor/vodpl.py | 32 + youtube_dlc/extractor/vodplatform.py | 40 + youtube_dlc/extractor/voicerepublic.py | 62 + youtube_dlc/extractor/voot.py | 100 + youtube_dlc/extractor/voxmedia.py | 215 + youtube_dlc/extractor/vrak.py | 80 + youtube_dlc/extractor/vrt.py | 87 + youtube_dlc/extractor/vrv.py | 277 + youtube_dlc/extractor/vshare.py | 74 + youtube_dlc/extractor/vube.py | 172 + youtube_dlc/extractor/vuclip.py | 70 + youtube_dlc/extractor/vvvvid.py | 158 + youtube_dlc/extractor/vyborymos.py | 55 + youtube_dlc/extractor/vzaar.py | 112 + youtube_dlc/extractor/wakanim.py | 66 + youtube_dlc/extractor/walla.py | 86 + youtube_dlc/extractor/washingtonpost.py | 183 + youtube_dlc/extractor/wat.py | 157 + youtube_dlc/extractor/watchbox.py | 161 + youtube_dlc/extractor/watchindianporn.py | 68 + youtube_dlc/extractor/wdr.py | 331 + youtube_dlc/extractor/webcaster.py | 102 + 
youtube_dlc/extractor/webofstories.py | 160 + youtube_dlc/extractor/weibo.py | 140 + youtube_dlc/extractor/weiqitv.py | 52 + youtube_dlc/extractor/wistia.py | 162 + youtube_dlc/extractor/worldstarhiphop.py | 40 + youtube_dlc/extractor/wsj.py | 123 + youtube_dlc/extractor/wwe.py | 140 + youtube_dlc/extractor/xbef.py | 44 + youtube_dlc/extractor/xboxclips.py | 53 + youtube_dlc/extractor/xfileshare.py | 193 + youtube_dlc/extractor/xhamster.py | 394 ++ youtube_dlc/extractor/xiami.py | 201 + youtube_dlc/extractor/ximalaya.py | 233 + youtube_dlc/extractor/xminus.py | 79 + youtube_dlc/extractor/xnxx.py | 84 + youtube_dlc/extractor/xstream.py | 119 + youtube_dlc/extractor/xtube.py | 200 + youtube_dlc/extractor/xuite.py | 153 + youtube_dlc/extractor/xvideos.py | 147 + youtube_dlc/extractor/xxxymovies.py | 81 + youtube_dlc/extractor/yahoo.py | 569 ++ youtube_dlc/extractor/yandexdisk.py | 118 + youtube_dlc/extractor/yandexmusic.py | 313 + youtube_dlc/extractor/yandexvideo.py | 104 + youtube_dlc/extractor/yapfiles.py | 101 + youtube_dlc/extractor/yesjapan.py | 62 + youtube_dlc/extractor/yinyuetai.py | 56 + youtube_dlc/extractor/ynet.py | 52 + youtube_dlc/extractor/youjizz.py | 95 + youtube_dlc/extractor/youku.py | 309 + youtube_dlc/extractor/younow.py | 202 + youtube_dlc/extractor/youporn.py | 203 + youtube_dlc/extractor/yourporn.py | 67 + youtube_dlc/extractor/yourupload.py | 46 + youtube_dlc/extractor/youtube.py | 3626 +++++++++++ youtube_dlc/extractor/zapiks.py | 109 + youtube_dlc/extractor/zaq1.py | 101 + youtube_dlc/extractor/zattoo.py | 433 ++ youtube_dlc/extractor/zdf.py | 332 + youtube_dlc/extractor/zingmp3.py | 143 + youtube_dlc/extractor/zype.py | 134 + youtube_dlc/jsinterp.py | 262 + youtube_dlc/options.py | 946 +++ youtube_dlc/postprocessor/__init__.py | 42 + youtube_dlc/postprocessor/common.py | 69 + youtube_dlc/postprocessor/embedthumbnail.py | 145 + .../postprocessor/execafterdownload.py | 31 + youtube_dlc/postprocessor/ffmpeg.py | 683 ++ 
.../postprocessor/metadatafromtitle.py | 48 + youtube_dlc/postprocessor/xattrpp.py | 79 + youtube_dlc/socks.py | 273 + youtube_dlc/swfinterp.py | 834 +++ youtube_dlc/update.py | 190 + youtube_dlc/utils.py | 5715 +++++++++++++++++ youtube_dlc/version.py | 3 + 891 files changed, 157099 insertions(+) create mode 100644 AUTHORS create mode 100644 ChangeLog create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 PKG-INFO create mode 100644 README.md create mode 100644 README.txt create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/module_guide.rst create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 test/__init__.py create mode 100644 test/helper.py create mode 100644 test/parameters.json create mode 100644 test/swftests/.gitignore create mode 100644 test/swftests/ArrayAccess.as create mode 100644 test/swftests/ClassCall.as create mode 100644 test/swftests/ClassConstruction.as create mode 100644 test/swftests/ConstArrayAccess.as create mode 100644 test/swftests/ConstantInt.as create mode 100644 test/swftests/DictCall.as create mode 100644 test/swftests/EqualsOperator.as create mode 100644 test/swftests/LocalVars.as create mode 100644 test/swftests/MemberAssignment.as create mode 100644 test/swftests/NeOperator.as create mode 100644 test/swftests/PrivateCall.as create mode 100644 test/swftests/PrivateVoidCall.as create mode 100644 test/swftests/StaticAssignment.as create mode 100644 test/swftests/StaticRetrieval.as create mode 100644 test/swftests/StringBasics.as create mode 100644 test/swftests/StringCharCodeAt.as create mode 100644 test/swftests/StringConversion.as create mode 100644 test/test_InfoExtractor.py create mode 100644 test/test_YoutubeDL.py create mode 100644 test/test_YoutubeDLCookieJar.py create mode 100644 test/test_aes.py create mode 100644 test/test_age_restriction.py create mode 100644 test/test_all_urls.py create mode 100644 
test/test_cache.py create mode 100644 test/test_compat.py create mode 100644 test/test_download.py create mode 100644 test/test_downloader_http.py create mode 100644 test/test_execution.py create mode 100644 test/test_http.py create mode 100644 test/test_iqiyi_sdk_interpreter.py create mode 100644 test/test_jsinterp.py create mode 100644 test/test_netrc.py create mode 100644 test/test_options.py create mode 100644 test/test_postprocessors.py create mode 100644 test/test_socks.py create mode 100644 test/test_subtitles.py create mode 100644 test/test_swfinterp.py create mode 100644 test/test_unicode_literals.py create mode 100644 test/test_update.py create mode 100644 test/test_utils.py create mode 100644 test/test_verbose_output.py create mode 100644 test/test_write_annotations.py create mode 100644 test/test_youtube_chapters.py create mode 100644 test/test_youtube_lists.py create mode 100644 test/test_youtube_signature.py create mode 100644 test/testcert.pem create mode 100644 test/testdata/cookies/httponly_cookies.txt create mode 100644 test/testdata/cookies/malformed_cookies.txt create mode 100644 test/testdata/cookies/session_cookies.txt create mode 100644 test/testdata/f4m/custom_base_url.f4m create mode 100644 test/testdata/m3u8/pluzz_francetv_11507.m3u8 create mode 100644 test/testdata/m3u8/teamcoco_11995.m3u8 create mode 100644 test/testdata/m3u8/ted_18923.m3u8 create mode 100644 test/testdata/m3u8/toggle_mobile_12211.m3u8 create mode 100644 test/testdata/m3u8/twitch_vod.m3u8 create mode 100644 test/testdata/m3u8/vidio.m3u8 create mode 100644 test/testdata/mpd/float_duration.mpd create mode 100644 test/testdata/mpd/unfragmented.mpd create mode 100644 test/testdata/mpd/urls_only.mpd create mode 100644 test/testdata/xspf/foo_xspf.xspf create mode 100644 test/versions.json create mode 100644 youtube-dlc.1 create mode 100644 youtube-dlc.bash-completion create mode 100644 youtube-dlc.fish create mode 100644 youtube_dlc.egg-info/PKG-INFO create mode 100644 
youtube_dlc.egg-info/SOURCES.txt create mode 100644 youtube_dlc.egg-info/dependency_links.txt create mode 100644 youtube_dlc.egg-info/entry_points.txt create mode 100644 youtube_dlc.egg-info/top_level.txt create mode 100644 youtube_dlc/YoutubeDL.py create mode 100644 youtube_dlc/__init__.py create mode 100644 youtube_dlc/__main__.py create mode 100644 youtube_dlc/aes.py create mode 100644 youtube_dlc/cache.py create mode 100644 youtube_dlc/compat.py create mode 100644 youtube_dlc/downloader/__init__.py create mode 100644 youtube_dlc/downloader/common.py create mode 100644 youtube_dlc/downloader/dash.py create mode 100644 youtube_dlc/downloader/external.py create mode 100644 youtube_dlc/downloader/f4m.py create mode 100644 youtube_dlc/downloader/fragment.py create mode 100644 youtube_dlc/downloader/hls.py create mode 100644 youtube_dlc/downloader/http.py create mode 100644 youtube_dlc/downloader/ism.py create mode 100644 youtube_dlc/downloader/rtmp.py create mode 100644 youtube_dlc/downloader/rtsp.py create mode 100644 youtube_dlc/downloader/youtube_live_chat.py create mode 100644 youtube_dlc/extractor/__init__.py create mode 100644 youtube_dlc/extractor/abc.py create mode 100644 youtube_dlc/extractor/abcnews.py create mode 100644 youtube_dlc/extractor/abcotvs.py create mode 100644 youtube_dlc/extractor/academicearth.py create mode 100644 youtube_dlc/extractor/acast.py create mode 100644 youtube_dlc/extractor/adn.py create mode 100644 youtube_dlc/extractor/adobeconnect.py create mode 100644 youtube_dlc/extractor/adobepass.py create mode 100644 youtube_dlc/extractor/adobetv.py create mode 100644 youtube_dlc/extractor/adultswim.py create mode 100644 youtube_dlc/extractor/aenetworks.py create mode 100644 youtube_dlc/extractor/afreecatv.py create mode 100644 youtube_dlc/extractor/airmozilla.py create mode 100644 youtube_dlc/extractor/aliexpress.py create mode 100644 youtube_dlc/extractor/aljazeera.py create mode 100644 youtube_dlc/extractor/allocine.py create mode 
100644 youtube_dlc/extractor/alphaporno.py create mode 100644 youtube_dlc/extractor/alura.py create mode 100644 youtube_dlc/extractor/amcnetworks.py create mode 100644 youtube_dlc/extractor/americastestkitchen.py create mode 100644 youtube_dlc/extractor/amp.py create mode 100644 youtube_dlc/extractor/animeondemand.py create mode 100644 youtube_dlc/extractor/anvato.py create mode 100644 youtube_dlc/extractor/aol.py create mode 100644 youtube_dlc/extractor/apa.py create mode 100644 youtube_dlc/extractor/aparat.py create mode 100644 youtube_dlc/extractor/appleconnect.py create mode 100644 youtube_dlc/extractor/appletrailers.py create mode 100644 youtube_dlc/extractor/archiveorg.py create mode 100644 youtube_dlc/extractor/ard.py create mode 100644 youtube_dlc/extractor/arkena.py create mode 100644 youtube_dlc/extractor/arte.py create mode 100644 youtube_dlc/extractor/asiancrush.py create mode 100644 youtube_dlc/extractor/atresplayer.py create mode 100644 youtube_dlc/extractor/atttechchannel.py create mode 100644 youtube_dlc/extractor/atvat.py create mode 100644 youtube_dlc/extractor/audimedia.py create mode 100644 youtube_dlc/extractor/audioboom.py create mode 100644 youtube_dlc/extractor/audiomack.py create mode 100644 youtube_dlc/extractor/awaan.py create mode 100644 youtube_dlc/extractor/aws.py create mode 100644 youtube_dlc/extractor/azmedien.py create mode 100644 youtube_dlc/extractor/baidu.py create mode 100644 youtube_dlc/extractor/bandcamp.py create mode 100644 youtube_dlc/extractor/bbc.py create mode 100644 youtube_dlc/extractor/beampro.py create mode 100644 youtube_dlc/extractor/beatport.py create mode 100644 youtube_dlc/extractor/beeg.py create mode 100644 youtube_dlc/extractor/behindkink.py create mode 100644 youtube_dlc/extractor/bellmedia.py create mode 100644 youtube_dlc/extractor/bet.py create mode 100644 youtube_dlc/extractor/bfi.py create mode 100644 youtube_dlc/extractor/bigflix.py create mode 100644 youtube_dlc/extractor/bild.py create mode 100644 
youtube_dlc/extractor/bilibili.py create mode 100644 youtube_dlc/extractor/biobiochiletv.py create mode 100644 youtube_dlc/extractor/biqle.py create mode 100644 youtube_dlc/extractor/bitchute.py create mode 100644 youtube_dlc/extractor/bleacherreport.py create mode 100644 youtube_dlc/extractor/blinkx.py create mode 100644 youtube_dlc/extractor/bloomberg.py create mode 100644 youtube_dlc/extractor/bokecc.py create mode 100644 youtube_dlc/extractor/bostonglobe.py create mode 100644 youtube_dlc/extractor/bpb.py create mode 100644 youtube_dlc/extractor/br.py create mode 100644 youtube_dlc/extractor/bravotv.py create mode 100644 youtube_dlc/extractor/breakcom.py create mode 100644 youtube_dlc/extractor/brightcove.py create mode 100644 youtube_dlc/extractor/businessinsider.py create mode 100644 youtube_dlc/extractor/buzzfeed.py create mode 100644 youtube_dlc/extractor/byutv.py create mode 100644 youtube_dlc/extractor/c56.py create mode 100644 youtube_dlc/extractor/camdemy.py create mode 100644 youtube_dlc/extractor/cammodels.py create mode 100644 youtube_dlc/extractor/camtube.py create mode 100644 youtube_dlc/extractor/camwithher.py create mode 100644 youtube_dlc/extractor/canalc2.py create mode 100644 youtube_dlc/extractor/canalplus.py create mode 100644 youtube_dlc/extractor/canvas.py create mode 100644 youtube_dlc/extractor/carambatv.py create mode 100644 youtube_dlc/extractor/cartoonnetwork.py create mode 100644 youtube_dlc/extractor/cbc.py create mode 100644 youtube_dlc/extractor/cbs.py create mode 100644 youtube_dlc/extractor/cbsinteractive.py create mode 100644 youtube_dlc/extractor/cbslocal.py create mode 100644 youtube_dlc/extractor/cbsnews.py create mode 100644 youtube_dlc/extractor/cbssports.py create mode 100644 youtube_dlc/extractor/ccc.py create mode 100644 youtube_dlc/extractor/ccma.py create mode 100644 youtube_dlc/extractor/cctv.py create mode 100644 youtube_dlc/extractor/cda.py create mode 100644 youtube_dlc/extractor/ceskatelevize.py create mode 100644 
youtube_dlc/extractor/channel9.py create mode 100644 youtube_dlc/extractor/charlierose.py create mode 100644 youtube_dlc/extractor/chaturbate.py create mode 100644 youtube_dlc/extractor/chilloutzone.py create mode 100644 youtube_dlc/extractor/chirbit.py create mode 100644 youtube_dlc/extractor/cinchcast.py create mode 100644 youtube_dlc/extractor/cinemax.py create mode 100644 youtube_dlc/extractor/ciscolive.py create mode 100644 youtube_dlc/extractor/cjsw.py create mode 100644 youtube_dlc/extractor/cliphunter.py create mode 100644 youtube_dlc/extractor/clippit.py create mode 100644 youtube_dlc/extractor/cliprs.py create mode 100644 youtube_dlc/extractor/clipsyndicate.py create mode 100644 youtube_dlc/extractor/closertotruth.py create mode 100644 youtube_dlc/extractor/cloudflarestream.py create mode 100644 youtube_dlc/extractor/cloudy.py create mode 100644 youtube_dlc/extractor/clubic.py create mode 100644 youtube_dlc/extractor/clyp.py create mode 100644 youtube_dlc/extractor/cmt.py create mode 100644 youtube_dlc/extractor/cnbc.py create mode 100644 youtube_dlc/extractor/cnn.py create mode 100644 youtube_dlc/extractor/comedycentral.py create mode 100644 youtube_dlc/extractor/common.py create mode 100644 youtube_dlc/extractor/commonmistakes.py create mode 100644 youtube_dlc/extractor/commonprotocols.py create mode 100644 youtube_dlc/extractor/condenast.py create mode 100644 youtube_dlc/extractor/contv.py create mode 100644 youtube_dlc/extractor/corus.py create mode 100644 youtube_dlc/extractor/coub.py create mode 100644 youtube_dlc/extractor/cracked.py create mode 100644 youtube_dlc/extractor/crackle.py create mode 100644 youtube_dlc/extractor/crooksandliars.py create mode 100644 youtube_dlc/extractor/crunchyroll.py create mode 100644 youtube_dlc/extractor/cspan.py create mode 100644 youtube_dlc/extractor/ctsnews.py create mode 100644 youtube_dlc/extractor/ctvnews.py create mode 100644 youtube_dlc/extractor/cultureunplugged.py create mode 100644 
youtube_dlc/extractor/curiositystream.py create mode 100644 youtube_dlc/extractor/cwtv.py create mode 100644 youtube_dlc/extractor/dailymail.py create mode 100644 youtube_dlc/extractor/dailymotion.py create mode 100644 youtube_dlc/extractor/daum.py create mode 100644 youtube_dlc/extractor/dbtv.py create mode 100644 youtube_dlc/extractor/dctp.py create mode 100644 youtube_dlc/extractor/deezer.py create mode 100644 youtube_dlc/extractor/defense.py create mode 100644 youtube_dlc/extractor/democracynow.py create mode 100644 youtube_dlc/extractor/dfb.py create mode 100644 youtube_dlc/extractor/dhm.py create mode 100644 youtube_dlc/extractor/digg.py create mode 100644 youtube_dlc/extractor/digiteka.py create mode 100644 youtube_dlc/extractor/discovery.py create mode 100644 youtube_dlc/extractor/discoverygo.py create mode 100644 youtube_dlc/extractor/discoverynetworks.py create mode 100644 youtube_dlc/extractor/discoveryvr.py create mode 100644 youtube_dlc/extractor/disney.py create mode 100644 youtube_dlc/extractor/dispeak.py create mode 100644 youtube_dlc/extractor/dlive.py create mode 100644 youtube_dlc/extractor/doodstream.py create mode 100644 youtube_dlc/extractor/dotsub.py create mode 100644 youtube_dlc/extractor/douyutv.py create mode 100644 youtube_dlc/extractor/dplay.py create mode 100644 youtube_dlc/extractor/drbonanza.py create mode 100644 youtube_dlc/extractor/dropbox.py create mode 100644 youtube_dlc/extractor/drtuber.py create mode 100644 youtube_dlc/extractor/drtv.py create mode 100644 youtube_dlc/extractor/dtube.py create mode 100644 youtube_dlc/extractor/duboku.py create mode 100644 youtube_dlc/extractor/dumpert.py create mode 100644 youtube_dlc/extractor/dvtv.py create mode 100644 youtube_dlc/extractor/dw.py create mode 100644 youtube_dlc/extractor/eagleplatform.py create mode 100644 youtube_dlc/extractor/ebaumsworld.py create mode 100644 youtube_dlc/extractor/echomsk.py create mode 100644 youtube_dlc/extractor/egghead.py create mode 100644 
youtube_dlc/extractor/ehow.py create mode 100644 youtube_dlc/extractor/eighttracks.py create mode 100644 youtube_dlc/extractor/einthusan.py create mode 100644 youtube_dlc/extractor/eitb.py create mode 100644 youtube_dlc/extractor/ellentube.py create mode 100644 youtube_dlc/extractor/elonet.py create mode 100644 youtube_dlc/extractor/elpais.py create mode 100644 youtube_dlc/extractor/embedly.py create mode 100644 youtube_dlc/extractor/engadget.py create mode 100644 youtube_dlc/extractor/eporner.py create mode 100644 youtube_dlc/extractor/eroprofile.py create mode 100644 youtube_dlc/extractor/escapist.py create mode 100644 youtube_dlc/extractor/espn.py create mode 100644 youtube_dlc/extractor/esri.py create mode 100644 youtube_dlc/extractor/europa.py create mode 100644 youtube_dlc/extractor/everyonesmixtape.py create mode 100644 youtube_dlc/extractor/expotv.py create mode 100644 youtube_dlc/extractor/expressen.py create mode 100644 youtube_dlc/extractor/extractors.py create mode 100644 youtube_dlc/extractor/extremetube.py create mode 100644 youtube_dlc/extractor/eyedotv.py create mode 100644 youtube_dlc/extractor/facebook.py create mode 100644 youtube_dlc/extractor/faz.py create mode 100644 youtube_dlc/extractor/fc2.py create mode 100644 youtube_dlc/extractor/fczenit.py create mode 100644 youtube_dlc/extractor/filmon.py create mode 100644 youtube_dlc/extractor/filmweb.py create mode 100644 youtube_dlc/extractor/firsttv.py create mode 100644 youtube_dlc/extractor/fivemin.py create mode 100644 youtube_dlc/extractor/fivetv.py create mode 100644 youtube_dlc/extractor/flickr.py create mode 100644 youtube_dlc/extractor/folketinget.py create mode 100644 youtube_dlc/extractor/footyroom.py create mode 100644 youtube_dlc/extractor/formula1.py create mode 100644 youtube_dlc/extractor/fourtube.py create mode 100644 youtube_dlc/extractor/fox.py create mode 100644 youtube_dlc/extractor/fox9.py create mode 100644 youtube_dlc/extractor/foxgay.py create mode 100644 
youtube_dlc/extractor/foxnews.py create mode 100644 youtube_dlc/extractor/foxsports.py create mode 100644 youtube_dlc/extractor/franceculture.py create mode 100644 youtube_dlc/extractor/franceinter.py create mode 100644 youtube_dlc/extractor/francetv.py create mode 100644 youtube_dlc/extractor/freesound.py create mode 100644 youtube_dlc/extractor/freespeech.py create mode 100644 youtube_dlc/extractor/freshlive.py create mode 100644 youtube_dlc/extractor/frontendmasters.py create mode 100644 youtube_dlc/extractor/funimation.py create mode 100644 youtube_dlc/extractor/funk.py create mode 100644 youtube_dlc/extractor/fusion.py create mode 100644 youtube_dlc/extractor/fxnetworks.py create mode 100644 youtube_dlc/extractor/gaia.py create mode 100644 youtube_dlc/extractor/gameinformer.py create mode 100644 youtube_dlc/extractor/gamespot.py create mode 100644 youtube_dlc/extractor/gamestar.py create mode 100644 youtube_dlc/extractor/gaskrank.py create mode 100644 youtube_dlc/extractor/gazeta.py create mode 100644 youtube_dlc/extractor/gdcvault.py create mode 100644 youtube_dlc/extractor/generic.py create mode 100644 youtube_dlc/extractor/gfycat.py create mode 100644 youtube_dlc/extractor/giantbomb.py create mode 100644 youtube_dlc/extractor/giga.py create mode 100644 youtube_dlc/extractor/gigya.py create mode 100644 youtube_dlc/extractor/glide.py create mode 100644 youtube_dlc/extractor/globo.py create mode 100644 youtube_dlc/extractor/go.py create mode 100644 youtube_dlc/extractor/godtube.py create mode 100644 youtube_dlc/extractor/golem.py create mode 100644 youtube_dlc/extractor/googledrive.py create mode 100644 youtube_dlc/extractor/googleplus.py create mode 100644 youtube_dlc/extractor/googlesearch.py create mode 100644 youtube_dlc/extractor/goshgay.py create mode 100644 youtube_dlc/extractor/gputechconf.py create mode 100644 youtube_dlc/extractor/groupon.py create mode 100644 youtube_dlc/extractor/hbo.py create mode 100644 youtube_dlc/extractor/hearthisat.py create 
mode 100644 youtube_dlc/extractor/heise.py create mode 100644 youtube_dlc/extractor/hellporno.py create mode 100644 youtube_dlc/extractor/helsinki.py create mode 100644 youtube_dlc/extractor/hentaistigma.py create mode 100644 youtube_dlc/extractor/hgtv.py create mode 100644 youtube_dlc/extractor/hidive.py create mode 100644 youtube_dlc/extractor/historicfilms.py create mode 100644 youtube_dlc/extractor/hitbox.py create mode 100644 youtube_dlc/extractor/hitrecord.py create mode 100644 youtube_dlc/extractor/hketv.py create mode 100644 youtube_dlc/extractor/hornbunny.py create mode 100644 youtube_dlc/extractor/hotnewhiphop.py create mode 100644 youtube_dlc/extractor/hotstar.py create mode 100644 youtube_dlc/extractor/howcast.py create mode 100644 youtube_dlc/extractor/howstuffworks.py create mode 100644 youtube_dlc/extractor/hrfensehen.py create mode 100644 youtube_dlc/extractor/hrti.py create mode 100644 youtube_dlc/extractor/huajiao.py create mode 100644 youtube_dlc/extractor/huffpost.py create mode 100644 youtube_dlc/extractor/hungama.py create mode 100644 youtube_dlc/extractor/hypem.py create mode 100644 youtube_dlc/extractor/ign.py create mode 100644 youtube_dlc/extractor/imdb.py create mode 100644 youtube_dlc/extractor/imggaming.py create mode 100644 youtube_dlc/extractor/imgur.py create mode 100644 youtube_dlc/extractor/ina.py create mode 100644 youtube_dlc/extractor/inc.py create mode 100644 youtube_dlc/extractor/indavideo.py create mode 100644 youtube_dlc/extractor/infoq.py create mode 100644 youtube_dlc/extractor/instagram.py create mode 100644 youtube_dlc/extractor/internazionale.py create mode 100644 youtube_dlc/extractor/internetvideoarchive.py create mode 100644 youtube_dlc/extractor/iprima.py create mode 100644 youtube_dlc/extractor/iqiyi.py create mode 100644 youtube_dlc/extractor/ir90tv.py create mode 100644 youtube_dlc/extractor/itv.py create mode 100644 youtube_dlc/extractor/ivi.py create mode 100644 youtube_dlc/extractor/ivideon.py create mode 
100644 youtube_dlc/extractor/iwara.py create mode 100644 youtube_dlc/extractor/izlesene.py create mode 100644 youtube_dlc/extractor/jamendo.py create mode 100644 youtube_dlc/extractor/jeuxvideo.py create mode 100644 youtube_dlc/extractor/joj.py create mode 100644 youtube_dlc/extractor/jove.py create mode 100644 youtube_dlc/extractor/jwplatform.py create mode 100644 youtube_dlc/extractor/kakao.py create mode 100644 youtube_dlc/extractor/kaltura.py create mode 100644 youtube_dlc/extractor/kanalplay.py create mode 100644 youtube_dlc/extractor/kankan.py create mode 100644 youtube_dlc/extractor/karaoketv.py create mode 100644 youtube_dlc/extractor/karrierevideos.py create mode 100644 youtube_dlc/extractor/keezmovies.py create mode 100644 youtube_dlc/extractor/ketnet.py create mode 100644 youtube_dlc/extractor/khanacademy.py create mode 100644 youtube_dlc/extractor/kickstarter.py create mode 100644 youtube_dlc/extractor/kinja.py create mode 100644 youtube_dlc/extractor/kinopoisk.py create mode 100644 youtube_dlc/extractor/konserthusetplay.py create mode 100644 youtube_dlc/extractor/krasview.py create mode 100644 youtube_dlc/extractor/ku6.py create mode 100644 youtube_dlc/extractor/kusi.py create mode 100644 youtube_dlc/extractor/kuwo.py create mode 100644 youtube_dlc/extractor/la7.py create mode 100644 youtube_dlc/extractor/laola1tv.py create mode 100644 youtube_dlc/extractor/lci.py create mode 100644 youtube_dlc/extractor/lcp.py create mode 100644 youtube_dlc/extractor/lecture2go.py create mode 100644 youtube_dlc/extractor/lecturio.py create mode 100644 youtube_dlc/extractor/leeco.py create mode 100644 youtube_dlc/extractor/lego.py create mode 100644 youtube_dlc/extractor/lemonde.py create mode 100644 youtube_dlc/extractor/lenta.py create mode 100644 youtube_dlc/extractor/libraryofcongress.py create mode 100644 youtube_dlc/extractor/libsyn.py create mode 100644 youtube_dlc/extractor/lifenews.py create mode 100644 youtube_dlc/extractor/limelight.py create mode 100644 
youtube_dlc/extractor/line.py create mode 100644 youtube_dlc/extractor/linkedin.py create mode 100644 youtube_dlc/extractor/linuxacademy.py create mode 100644 youtube_dlc/extractor/litv.py create mode 100644 youtube_dlc/extractor/livejournal.py create mode 100644 youtube_dlc/extractor/liveleak.py create mode 100644 youtube_dlc/extractor/livestream.py create mode 100644 youtube_dlc/extractor/lnkgo.py create mode 100644 youtube_dlc/extractor/localnews8.py create mode 100644 youtube_dlc/extractor/lovehomeporn.py create mode 100644 youtube_dlc/extractor/lrt.py create mode 100644 youtube_dlc/extractor/lynda.py create mode 100644 youtube_dlc/extractor/m6.py create mode 100644 youtube_dlc/extractor/magentamusik360.py create mode 100644 youtube_dlc/extractor/mailru.py create mode 100644 youtube_dlc/extractor/malltv.py create mode 100644 youtube_dlc/extractor/mangomolo.py create mode 100644 youtube_dlc/extractor/manyvids.py create mode 100644 youtube_dlc/extractor/markiza.py create mode 100644 youtube_dlc/extractor/massengeschmacktv.py create mode 100644 youtube_dlc/extractor/matchtv.py create mode 100644 youtube_dlc/extractor/mdr.py create mode 100644 youtube_dlc/extractor/medialaan.py create mode 100644 youtube_dlc/extractor/mediaset.py create mode 100644 youtube_dlc/extractor/mediasite.py create mode 100644 youtube_dlc/extractor/medici.py create mode 100644 youtube_dlc/extractor/megaphone.py create mode 100644 youtube_dlc/extractor/meipai.py create mode 100644 youtube_dlc/extractor/melonvod.py create mode 100644 youtube_dlc/extractor/meta.py create mode 100644 youtube_dlc/extractor/metacafe.py create mode 100644 youtube_dlc/extractor/metacritic.py create mode 100644 youtube_dlc/extractor/mgoon.py create mode 100644 youtube_dlc/extractor/mgtv.py create mode 100644 youtube_dlc/extractor/miaopai.py create mode 100644 youtube_dlc/extractor/microsoftvirtualacademy.py create mode 100644 youtube_dlc/extractor/ministrygrid.py create mode 100644 youtube_dlc/extractor/minoto.py 
create mode 100644 youtube_dlc/extractor/miomio.py create mode 100644 youtube_dlc/extractor/mit.py create mode 100644 youtube_dlc/extractor/mitele.py create mode 100644 youtube_dlc/extractor/mixcloud.py create mode 100644 youtube_dlc/extractor/mlb.py create mode 100644 youtube_dlc/extractor/mnet.py create mode 100644 youtube_dlc/extractor/moevideo.py create mode 100644 youtube_dlc/extractor/mofosex.py create mode 100644 youtube_dlc/extractor/mojvideo.py create mode 100644 youtube_dlc/extractor/morningstar.py create mode 100644 youtube_dlc/extractor/motherless.py create mode 100644 youtube_dlc/extractor/motorsport.py create mode 100644 youtube_dlc/extractor/movieclips.py create mode 100644 youtube_dlc/extractor/moviezine.py create mode 100644 youtube_dlc/extractor/movingimage.py create mode 100644 youtube_dlc/extractor/msn.py create mode 100644 youtube_dlc/extractor/mtv.py create mode 100644 youtube_dlc/extractor/muenchentv.py create mode 100644 youtube_dlc/extractor/mwave.py create mode 100644 youtube_dlc/extractor/mychannels.py create mode 100644 youtube_dlc/extractor/myspace.py create mode 100644 youtube_dlc/extractor/myspass.py create mode 100644 youtube_dlc/extractor/myvi.py create mode 100644 youtube_dlc/extractor/myvideoge.py create mode 100644 youtube_dlc/extractor/myvidster.py create mode 100644 youtube_dlc/extractor/nationalgeographic.py create mode 100644 youtube_dlc/extractor/naver.py create mode 100644 youtube_dlc/extractor/nba.py create mode 100644 youtube_dlc/extractor/nbc.py create mode 100644 youtube_dlc/extractor/ndr.py create mode 100644 youtube_dlc/extractor/ndtv.py create mode 100644 youtube_dlc/extractor/nerdcubed.py create mode 100644 youtube_dlc/extractor/neteasemusic.py create mode 100644 youtube_dlc/extractor/netzkino.py create mode 100644 youtube_dlc/extractor/newgrounds.py create mode 100644 youtube_dlc/extractor/newstube.py create mode 100644 youtube_dlc/extractor/nextmedia.py create mode 100644 youtube_dlc/extractor/nexx.py create mode 
100644 youtube_dlc/extractor/nfl.py create mode 100644 youtube_dlc/extractor/nhk.py create mode 100644 youtube_dlc/extractor/nhl.py create mode 100644 youtube_dlc/extractor/nick.py create mode 100644 youtube_dlc/extractor/niconico.py create mode 100644 youtube_dlc/extractor/ninecninemedia.py create mode 100644 youtube_dlc/extractor/ninegag.py create mode 100644 youtube_dlc/extractor/ninenow.py create mode 100644 youtube_dlc/extractor/nintendo.py create mode 100644 youtube_dlc/extractor/njpwworld.py create mode 100644 youtube_dlc/extractor/nobelprize.py create mode 100644 youtube_dlc/extractor/noco.py create mode 100644 youtube_dlc/extractor/nonktube.py create mode 100644 youtube_dlc/extractor/noovo.py create mode 100644 youtube_dlc/extractor/normalboots.py create mode 100644 youtube_dlc/extractor/nosvideo.py create mode 100644 youtube_dlc/extractor/nova.py create mode 100644 youtube_dlc/extractor/nowness.py create mode 100644 youtube_dlc/extractor/noz.py create mode 100644 youtube_dlc/extractor/npo.py create mode 100644 youtube_dlc/extractor/npr.py create mode 100644 youtube_dlc/extractor/nrk.py create mode 100644 youtube_dlc/extractor/nrl.py create mode 100644 youtube_dlc/extractor/ntvcojp.py create mode 100644 youtube_dlc/extractor/ntvde.py create mode 100644 youtube_dlc/extractor/ntvru.py create mode 100644 youtube_dlc/extractor/nuevo.py create mode 100644 youtube_dlc/extractor/nuvid.py create mode 100644 youtube_dlc/extractor/nytimes.py create mode 100644 youtube_dlc/extractor/nzz.py create mode 100644 youtube_dlc/extractor/odatv.py create mode 100644 youtube_dlc/extractor/odnoklassniki.py create mode 100644 youtube_dlc/extractor/oktoberfesttv.py create mode 100644 youtube_dlc/extractor/once.py create mode 100644 youtube_dlc/extractor/ondemandkorea.py create mode 100644 youtube_dlc/extractor/onet.py create mode 100644 youtube_dlc/extractor/onionstudios.py create mode 100644 youtube_dlc/extractor/ooyala.py create mode 100644 youtube_dlc/extractor/openload.py 
create mode 100644 youtube_dlc/extractor/ora.py create mode 100644 youtube_dlc/extractor/orf.py create mode 100644 youtube_dlc/extractor/outsidetv.py create mode 100644 youtube_dlc/extractor/packtpub.py create mode 100644 youtube_dlc/extractor/pandoratv.py create mode 100644 youtube_dlc/extractor/parliamentliveuk.py create mode 100644 youtube_dlc/extractor/patreon.py create mode 100644 youtube_dlc/extractor/pbs.py create mode 100644 youtube_dlc/extractor/pearvideo.py create mode 100644 youtube_dlc/extractor/peertube.py create mode 100644 youtube_dlc/extractor/people.py create mode 100644 youtube_dlc/extractor/performgroup.py create mode 100644 youtube_dlc/extractor/periscope.py create mode 100644 youtube_dlc/extractor/philharmoniedeparis.py create mode 100644 youtube_dlc/extractor/phoenix.py create mode 100644 youtube_dlc/extractor/photobucket.py create mode 100644 youtube_dlc/extractor/picarto.py create mode 100644 youtube_dlc/extractor/piksel.py create mode 100644 youtube_dlc/extractor/pinkbike.py create mode 100644 youtube_dlc/extractor/pladform.py create mode 100644 youtube_dlc/extractor/platzi.py create mode 100644 youtube_dlc/extractor/playfm.py create mode 100644 youtube_dlc/extractor/playplustv.py create mode 100644 youtube_dlc/extractor/plays.py create mode 100644 youtube_dlc/extractor/playtvak.py create mode 100644 youtube_dlc/extractor/playvid.py create mode 100644 youtube_dlc/extractor/playwire.py create mode 100644 youtube_dlc/extractor/pluralsight.py create mode 100644 youtube_dlc/extractor/podomatic.py create mode 100644 youtube_dlc/extractor/pokemon.py create mode 100644 youtube_dlc/extractor/polskieradio.py create mode 100644 youtube_dlc/extractor/popcorntimes.py create mode 100644 youtube_dlc/extractor/popcorntv.py create mode 100644 youtube_dlc/extractor/porn91.py create mode 100644 youtube_dlc/extractor/porncom.py create mode 100644 youtube_dlc/extractor/pornhd.py create mode 100644 youtube_dlc/extractor/pornhub.py create mode 100644 
youtube_dlc/extractor/pornotube.py create mode 100644 youtube_dlc/extractor/pornovoisines.py create mode 100644 youtube_dlc/extractor/pornoxo.py create mode 100644 youtube_dlc/extractor/presstv.py create mode 100644 youtube_dlc/extractor/prosiebensat1.py create mode 100644 youtube_dlc/extractor/puhutv.py create mode 100644 youtube_dlc/extractor/puls4.py create mode 100644 youtube_dlc/extractor/pyvideo.py create mode 100644 youtube_dlc/extractor/qqmusic.py create mode 100644 youtube_dlc/extractor/r7.py create mode 100644 youtube_dlc/extractor/radiobremen.py create mode 100644 youtube_dlc/extractor/radiocanada.py create mode 100644 youtube_dlc/extractor/radiode.py create mode 100644 youtube_dlc/extractor/radiofrance.py create mode 100644 youtube_dlc/extractor/radiojavan.py create mode 100644 youtube_dlc/extractor/rai.py create mode 100644 youtube_dlc/extractor/raywenderlich.py create mode 100644 youtube_dlc/extractor/rbmaradio.py create mode 100644 youtube_dlc/extractor/rds.py create mode 100644 youtube_dlc/extractor/redbulltv.py create mode 100644 youtube_dlc/extractor/reddit.py create mode 100644 youtube_dlc/extractor/redtube.py create mode 100644 youtube_dlc/extractor/regiotv.py create mode 100644 youtube_dlc/extractor/rentv.py create mode 100644 youtube_dlc/extractor/restudy.py create mode 100644 youtube_dlc/extractor/reuters.py create mode 100644 youtube_dlc/extractor/reverbnation.py create mode 100644 youtube_dlc/extractor/rice.py create mode 100644 youtube_dlc/extractor/rmcdecouverte.py create mode 100644 youtube_dlc/extractor/ro220.py create mode 100644 youtube_dlc/extractor/rockstargames.py create mode 100644 youtube_dlc/extractor/roosterteeth.py create mode 100644 youtube_dlc/extractor/rottentomatoes.py create mode 100644 youtube_dlc/extractor/roxwel.py create mode 100644 youtube_dlc/extractor/rozhlas.py create mode 100644 youtube_dlc/extractor/rtbf.py create mode 100644 youtube_dlc/extractor/rte.py create mode 100644 youtube_dlc/extractor/rtl2.py create 
mode 100644 youtube_dlc/extractor/rtlnl.py create mode 100644 youtube_dlc/extractor/rtp.py create mode 100644 youtube_dlc/extractor/rts.py create mode 100644 youtube_dlc/extractor/rtve.py create mode 100644 youtube_dlc/extractor/rtvnh.py create mode 100644 youtube_dlc/extractor/rtvs.py create mode 100644 youtube_dlc/extractor/ruhd.py create mode 100644 youtube_dlc/extractor/rutube.py create mode 100644 youtube_dlc/extractor/rutv.py create mode 100644 youtube_dlc/extractor/ruutu.py create mode 100644 youtube_dlc/extractor/ruv.py create mode 100644 youtube_dlc/extractor/safari.py create mode 100644 youtube_dlc/extractor/sapo.py create mode 100644 youtube_dlc/extractor/savefrom.py create mode 100644 youtube_dlc/extractor/sbs.py create mode 100644 youtube_dlc/extractor/screencast.py create mode 100644 youtube_dlc/extractor/screencastomatic.py create mode 100644 youtube_dlc/extractor/scrippsnetworks.py create mode 100644 youtube_dlc/extractor/scte.py create mode 100644 youtube_dlc/extractor/seeker.py create mode 100644 youtube_dlc/extractor/senateisvp.py create mode 100644 youtube_dlc/extractor/sendtonews.py create mode 100644 youtube_dlc/extractor/servus.py create mode 100644 youtube_dlc/extractor/sevenplus.py create mode 100644 youtube_dlc/extractor/sexu.py create mode 100644 youtube_dlc/extractor/seznamzpravy.py create mode 100644 youtube_dlc/extractor/shahid.py create mode 100644 youtube_dlc/extractor/shared.py create mode 100644 youtube_dlc/extractor/showroomlive.py create mode 100644 youtube_dlc/extractor/sina.py create mode 100644 youtube_dlc/extractor/sixplay.py create mode 100644 youtube_dlc/extractor/sky.py create mode 100644 youtube_dlc/extractor/skylinewebcams.py create mode 100644 youtube_dlc/extractor/skynewsarabia.py create mode 100644 youtube_dlc/extractor/slideshare.py create mode 100644 youtube_dlc/extractor/slideslive.py create mode 100644 youtube_dlc/extractor/slutload.py create mode 100644 youtube_dlc/extractor/smotri.py create mode 100644 
youtube_dlc/extractor/snotr.py create mode 100644 youtube_dlc/extractor/sohu.py create mode 100644 youtube_dlc/extractor/sonyliv.py create mode 100644 youtube_dlc/extractor/soundcloud.py create mode 100644 youtube_dlc/extractor/soundgasm.py create mode 100644 youtube_dlc/extractor/southpark.py create mode 100644 youtube_dlc/extractor/spankbang.py create mode 100644 youtube_dlc/extractor/spankwire.py create mode 100644 youtube_dlc/extractor/spiegel.py create mode 100644 youtube_dlc/extractor/spiegeltv.py create mode 100644 youtube_dlc/extractor/spike.py create mode 100644 youtube_dlc/extractor/sport5.py create mode 100644 youtube_dlc/extractor/sportbox.py create mode 100644 youtube_dlc/extractor/sportdeutschland.py create mode 100644 youtube_dlc/extractor/springboardplatform.py create mode 100644 youtube_dlc/extractor/sprout.py create mode 100644 youtube_dlc/extractor/srgssr.py create mode 100644 youtube_dlc/extractor/srmediathek.py create mode 100644 youtube_dlc/extractor/stanfordoc.py create mode 100644 youtube_dlc/extractor/steam.py create mode 100644 youtube_dlc/extractor/stitcher.py create mode 100644 youtube_dlc/extractor/storyfire.py create mode 100644 youtube_dlc/extractor/streamable.py create mode 100644 youtube_dlc/extractor/streamcloud.py create mode 100644 youtube_dlc/extractor/streamcz.py create mode 100644 youtube_dlc/extractor/streetvoice.py create mode 100644 youtube_dlc/extractor/stretchinternet.py create mode 100644 youtube_dlc/extractor/stv.py create mode 100644 youtube_dlc/extractor/sunporno.py create mode 100644 youtube_dlc/extractor/sverigesradio.py create mode 100644 youtube_dlc/extractor/svt.py create mode 100644 youtube_dlc/extractor/swrmediathek.py create mode 100644 youtube_dlc/extractor/syfy.py create mode 100644 youtube_dlc/extractor/sztvhu.py create mode 100644 youtube_dlc/extractor/tagesschau.py create mode 100644 youtube_dlc/extractor/tass.py create mode 100644 youtube_dlc/extractor/tastytrade.py create mode 100644 
youtube_dlc/extractor/tbs.py create mode 100644 youtube_dlc/extractor/tdslifeway.py create mode 100644 youtube_dlc/extractor/teachable.py create mode 100644 youtube_dlc/extractor/teachertube.py create mode 100644 youtube_dlc/extractor/teachingchannel.py create mode 100644 youtube_dlc/extractor/teamcoco.py create mode 100644 youtube_dlc/extractor/teamtreehouse.py create mode 100644 youtube_dlc/extractor/techtalks.py create mode 100644 youtube_dlc/extractor/ted.py create mode 100644 youtube_dlc/extractor/tele13.py create mode 100644 youtube_dlc/extractor/tele5.py create mode 100644 youtube_dlc/extractor/telebruxelles.py create mode 100644 youtube_dlc/extractor/telecinco.py create mode 100644 youtube_dlc/extractor/telegraaf.py create mode 100644 youtube_dlc/extractor/telemb.py create mode 100644 youtube_dlc/extractor/telequebec.py create mode 100644 youtube_dlc/extractor/teletask.py create mode 100644 youtube_dlc/extractor/telewebion.py create mode 100644 youtube_dlc/extractor/tennistv.py create mode 100644 youtube_dlc/extractor/tenplay.py create mode 100644 youtube_dlc/extractor/testurl.py create mode 100644 youtube_dlc/extractor/tf1.py create mode 100644 youtube_dlc/extractor/tfo.py create mode 100644 youtube_dlc/extractor/theintercept.py create mode 100644 youtube_dlc/extractor/theplatform.py create mode 100644 youtube_dlc/extractor/thescene.py create mode 100644 youtube_dlc/extractor/thestar.py create mode 100644 youtube_dlc/extractor/thesun.py create mode 100644 youtube_dlc/extractor/theweatherchannel.py create mode 100644 youtube_dlc/extractor/thisamericanlife.py create mode 100644 youtube_dlc/extractor/thisav.py create mode 100644 youtube_dlc/extractor/thisoldhouse.py create mode 100644 youtube_dlc/extractor/threeqsdn.py create mode 100644 youtube_dlc/extractor/tiktok.py create mode 100644 youtube_dlc/extractor/tinypic.py create mode 100644 youtube_dlc/extractor/tmz.py create mode 100644 youtube_dlc/extractor/tnaflix.py create mode 100644 
youtube_dlc/extractor/toggle.py create mode 100644 youtube_dlc/extractor/tonline.py create mode 100644 youtube_dlc/extractor/toongoggles.py create mode 100644 youtube_dlc/extractor/toutv.py create mode 100644 youtube_dlc/extractor/toypics.py create mode 100644 youtube_dlc/extractor/traileraddict.py create mode 100644 youtube_dlc/extractor/trilulilu.py create mode 100644 youtube_dlc/extractor/trunews.py create mode 100644 youtube_dlc/extractor/trutv.py create mode 100644 youtube_dlc/extractor/tube8.py create mode 100644 youtube_dlc/extractor/tubitv.py create mode 100644 youtube_dlc/extractor/tudou.py create mode 100644 youtube_dlc/extractor/tumblr.py create mode 100644 youtube_dlc/extractor/tunein.py create mode 100644 youtube_dlc/extractor/tunepk.py create mode 100644 youtube_dlc/extractor/turbo.py create mode 100644 youtube_dlc/extractor/turner.py create mode 100644 youtube_dlc/extractor/tv2.py create mode 100644 youtube_dlc/extractor/tv2dk.py create mode 100644 youtube_dlc/extractor/tv2hu.py create mode 100644 youtube_dlc/extractor/tv4.py create mode 100644 youtube_dlc/extractor/tv5mondeplus.py create mode 100644 youtube_dlc/extractor/tva.py create mode 100644 youtube_dlc/extractor/tvanouvelles.py create mode 100644 youtube_dlc/extractor/tvc.py create mode 100644 youtube_dlc/extractor/tvigle.py create mode 100644 youtube_dlc/extractor/tvland.py create mode 100644 youtube_dlc/extractor/tvn24.py create mode 100644 youtube_dlc/extractor/tvnet.py create mode 100644 youtube_dlc/extractor/tvnoe.py create mode 100644 youtube_dlc/extractor/tvnow.py create mode 100644 youtube_dlc/extractor/tvp.py create mode 100644 youtube_dlc/extractor/tvplay.py create mode 100644 youtube_dlc/extractor/tvplayer.py create mode 100644 youtube_dlc/extractor/tweakers.py create mode 100644 youtube_dlc/extractor/twentyfourvideo.py create mode 100644 youtube_dlc/extractor/twentymin.py create mode 100644 youtube_dlc/extractor/twentythreevideo.py create mode 100644 
youtube_dlc/extractor/twitcasting.py create mode 100644 youtube_dlc/extractor/twitch.py create mode 100644 youtube_dlc/extractor/twitter.py create mode 100644 youtube_dlc/extractor/udemy.py create mode 100644 youtube_dlc/extractor/udn.py create mode 100644 youtube_dlc/extractor/ufctv.py create mode 100644 youtube_dlc/extractor/uktvplay.py create mode 100644 youtube_dlc/extractor/umg.py create mode 100644 youtube_dlc/extractor/unistra.py create mode 100644 youtube_dlc/extractor/unity.py create mode 100644 youtube_dlc/extractor/uol.py create mode 100644 youtube_dlc/extractor/uplynk.py create mode 100644 youtube_dlc/extractor/urort.py create mode 100644 youtube_dlc/extractor/urplay.py create mode 100644 youtube_dlc/extractor/usanetwork.py create mode 100644 youtube_dlc/extractor/usatoday.py create mode 100644 youtube_dlc/extractor/ustream.py create mode 100644 youtube_dlc/extractor/ustudio.py create mode 100644 youtube_dlc/extractor/varzesh3.py create mode 100644 youtube_dlc/extractor/vbox7.py create mode 100644 youtube_dlc/extractor/veehd.py create mode 100644 youtube_dlc/extractor/veoh.py create mode 100644 youtube_dlc/extractor/vesti.py create mode 100644 youtube_dlc/extractor/vevo.py create mode 100644 youtube_dlc/extractor/vgtv.py create mode 100644 youtube_dlc/extractor/vh1.py create mode 100644 youtube_dlc/extractor/vice.py create mode 100644 youtube_dlc/extractor/vidbit.py create mode 100644 youtube_dlc/extractor/viddler.py create mode 100644 youtube_dlc/extractor/videa.py create mode 100644 youtube_dlc/extractor/videodetective.py create mode 100644 youtube_dlc/extractor/videofyme.py create mode 100644 youtube_dlc/extractor/videomore.py create mode 100644 youtube_dlc/extractor/videopress.py create mode 100644 youtube_dlc/extractor/vidio.py create mode 100644 youtube_dlc/extractor/vidlii.py create mode 100644 youtube_dlc/extractor/vidme.py create mode 100644 youtube_dlc/extractor/vidzi.py create mode 100644 youtube_dlc/extractor/vier.py create mode 100644 
youtube_dlc/extractor/viewlift.py create mode 100644 youtube_dlc/extractor/viidea.py create mode 100644 youtube_dlc/extractor/viki.py create mode 100644 youtube_dlc/extractor/vimeo.py create mode 100644 youtube_dlc/extractor/vimple.py create mode 100644 youtube_dlc/extractor/vine.py create mode 100644 youtube_dlc/extractor/viqeo.py create mode 100644 youtube_dlc/extractor/viu.py create mode 100644 youtube_dlc/extractor/vk.py create mode 100644 youtube_dlc/extractor/vlive.py create mode 100644 youtube_dlc/extractor/vodlocker.py create mode 100644 youtube_dlc/extractor/vodpl.py create mode 100644 youtube_dlc/extractor/vodplatform.py create mode 100644 youtube_dlc/extractor/voicerepublic.py create mode 100644 youtube_dlc/extractor/voot.py create mode 100644 youtube_dlc/extractor/voxmedia.py create mode 100644 youtube_dlc/extractor/vrak.py create mode 100644 youtube_dlc/extractor/vrt.py create mode 100644 youtube_dlc/extractor/vrv.py create mode 100644 youtube_dlc/extractor/vshare.py create mode 100644 youtube_dlc/extractor/vube.py create mode 100644 youtube_dlc/extractor/vuclip.py create mode 100644 youtube_dlc/extractor/vvvvid.py create mode 100644 youtube_dlc/extractor/vyborymos.py create mode 100644 youtube_dlc/extractor/vzaar.py create mode 100644 youtube_dlc/extractor/wakanim.py create mode 100644 youtube_dlc/extractor/walla.py create mode 100644 youtube_dlc/extractor/washingtonpost.py create mode 100644 youtube_dlc/extractor/wat.py create mode 100644 youtube_dlc/extractor/watchbox.py create mode 100644 youtube_dlc/extractor/watchindianporn.py create mode 100644 youtube_dlc/extractor/wdr.py create mode 100644 youtube_dlc/extractor/webcaster.py create mode 100644 youtube_dlc/extractor/webofstories.py create mode 100644 youtube_dlc/extractor/weibo.py create mode 100644 youtube_dlc/extractor/weiqitv.py create mode 100644 youtube_dlc/extractor/wistia.py create mode 100644 youtube_dlc/extractor/worldstarhiphop.py create mode 100644 youtube_dlc/extractor/wsj.py create 
mode 100644 youtube_dlc/extractor/wwe.py create mode 100644 youtube_dlc/extractor/xbef.py create mode 100644 youtube_dlc/extractor/xboxclips.py create mode 100644 youtube_dlc/extractor/xfileshare.py create mode 100644 youtube_dlc/extractor/xhamster.py create mode 100644 youtube_dlc/extractor/xiami.py create mode 100644 youtube_dlc/extractor/ximalaya.py create mode 100644 youtube_dlc/extractor/xminus.py create mode 100644 youtube_dlc/extractor/xnxx.py create mode 100644 youtube_dlc/extractor/xstream.py create mode 100644 youtube_dlc/extractor/xtube.py create mode 100644 youtube_dlc/extractor/xuite.py create mode 100644 youtube_dlc/extractor/xvideos.py create mode 100644 youtube_dlc/extractor/xxxymovies.py create mode 100644 youtube_dlc/extractor/yahoo.py create mode 100644 youtube_dlc/extractor/yandexdisk.py create mode 100644 youtube_dlc/extractor/yandexmusic.py create mode 100644 youtube_dlc/extractor/yandexvideo.py create mode 100644 youtube_dlc/extractor/yapfiles.py create mode 100644 youtube_dlc/extractor/yesjapan.py create mode 100644 youtube_dlc/extractor/yinyuetai.py create mode 100644 youtube_dlc/extractor/ynet.py create mode 100644 youtube_dlc/extractor/youjizz.py create mode 100644 youtube_dlc/extractor/youku.py create mode 100644 youtube_dlc/extractor/younow.py create mode 100644 youtube_dlc/extractor/youporn.py create mode 100644 youtube_dlc/extractor/yourporn.py create mode 100644 youtube_dlc/extractor/yourupload.py create mode 100644 youtube_dlc/extractor/youtube.py create mode 100644 youtube_dlc/extractor/zapiks.py create mode 100644 youtube_dlc/extractor/zaq1.py create mode 100644 youtube_dlc/extractor/zattoo.py create mode 100644 youtube_dlc/extractor/zdf.py create mode 100644 youtube_dlc/extractor/zingmp3.py create mode 100644 youtube_dlc/extractor/zype.py create mode 100644 youtube_dlc/jsinterp.py create mode 100644 youtube_dlc/options.py create mode 100644 youtube_dlc/postprocessor/__init__.py create mode 100644 
youtube_dlc/postprocessor/common.py create mode 100644 youtube_dlc/postprocessor/embedthumbnail.py create mode 100644 youtube_dlc/postprocessor/execafterdownload.py create mode 100644 youtube_dlc/postprocessor/ffmpeg.py create mode 100644 youtube_dlc/postprocessor/metadatafromtitle.py create mode 100644 youtube_dlc/postprocessor/xattrpp.py create mode 100644 youtube_dlc/socks.py create mode 100644 youtube_dlc/swfinterp.py create mode 100644 youtube_dlc/update.py create mode 100644 youtube_dlc/utils.py create mode 100644 youtube_dlc/version.py diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..b507cb8 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,248 @@ +Ricardo Garcia Gonzalez +Danny Colligan +Benjamin Johnson +Vasyl' Vavrychuk +Witold Baryluk +Paweł Paprota +Gergely Imreh +Rogério Brito +Philipp Hagemeister +Sören Schulze +Kevin Ngo +Ori Avtalion +shizeeg +Filippo Valsorda +Christian Albrecht +Dave Vasilevsky +Jaime Marquínez Ferrándiz +Jeff Crouse +Osama Khalid +Michael Walter +M. Yasoob Ullah Khalid +Julien Fraichard +Johny Mo Swag +Axel Noack +Albert Kim +Pierre Rudloff +Huarong Huo +Ismael Mejía +Steffan Donal +Andras Elso +Jelle van der Waa +Marcin Cieślak +Anton Larionov +Takuya Tsuchida +Sergey M. +Michael Orlitzky +Chris Gahan +Saimadhav Heblikar +Mike Col +Oleg Prutz +pulpe +Andreas Schmitz +Michael Kaiser +Niklas Laxström +David Triendl +Anthony Weems +David Wagner +Juan C. 
Olivares +Mattias Harrysson +phaer +Sainyam Kapoor +Nicolas Évrard +Jason Normore +Hoje Lee +Adam Thalhammer +Georg Jähnig +Ralf Haring +Koki Takahashi +Ariset Llerena +Adam Malcontenti-Wilson +Tobias Bell +Naglis Jonaitis +Charles Chen +Hassaan Ali +Dobrosław Żybort +David Fabijan +Sebastian Haas +Alexander Kirk +Erik Johnson +Keith Beckman +Ole Ernst +Aaron McDaniel (mcd1992) +Magnus Kolstad +Hari Padmanaban +Carlos Ramos +5moufl +lenaten +Dennis Scheiba +Damon Timm +winwon +Xavier Beynon +Gabriel Schubiner +xantares +Jan Matějka +Mauroy Sébastien +William Sewell +Dao Hoang Son +Oskar Jauch +Matthew Rayfield +t0mm0 +Tithen-Firion +Zack Fernandes +cryptonaut +Adrian Kretz +Mathias Rav +Petr Kutalek +Will Glynn +Max Reimann +Cédric Luthi +Thijs Vermeir +Joel Leclerc +Christopher Krooss +Ondřej Caletka +Dinesh S +Johan K. Jensen +Yen Chi Hsuan +Enam Mijbah Noor +David Luhmer +Shaya Goldberg +Paul Hartmann +Frans de Jonge +Robin de Rooij +Ryan Schmidt +Leslie P. Polzer +Duncan Keall +Alexander Mamay +Devin J. Pohly +Eduardo Ferro Aldama +Jeff Buchbinder +Amish Bhadeshia +Joram Schrijver +Will W. +Mohammad Teimori Pabandi +Roman Le Négrate +Matthias Küch +Julian Richen +Ping O. +Mister Hat +Peter Ding +jackyzy823 +George Brighton +Remita Amine +Aurélio A. 
Heckert +Bernhard Minks +sceext +Zach Bruggeman +Tjark Saul +slangangular +Behrouz Abbasi +ngld +nyuszika7h +Shaun Walbridge +Lee Jenkins +Anssi Hannula +Lukáš Lalinský +Qijiang Fan +Rémy Léone +Marco Ferragina +reiv +Muratcan Simsek +Evan Lu +flatgreen +Brian Foley +Vignesh Venkat +Tom Gijselinck +Founder Fang +Andrew Alexeyew +Saso Bezlaj +Erwin de Haan +Jens Wille +Robin Houtevelts +Patrick Griffis +Aidan Rowe +mutantmonkey +Ben Congdon +Kacper Michajłow +José Joaquín Atria +Viťas Strádal +Kagami Hiiragi +Philip Huppert +blahgeek +Kevin Deldycke +inondle +Tomáš Čech +Déstin Reed +Roman Tsiupa +Artur Krysiak +Jakub Adam Wieczorek +Aleksandar Topuzović +Nehal Patel +Rob van Bekkum +Petr Zvoníček +Pratyush Singh +Aleksander Nitecki +Sebastian Blunt +Matěj Cepl +Xie Yanbo +Philip Xu +John Hawkinson +Rich Leeper +Zhong Jianxin +Thor77 +Mattias Wadman +Arjan Verwer +Costy Petrisor +Logan B +Alex Seiler +Vijay Singh +Paul Hartmann +Stephen Chen +Fabian Stahl +Bagira +Odd Stråbø +Philip Herzog +Thomas Christlieb +Marek Rusinowski +Tobias Gruetzmacher +Olivier Bilodeau +Lars Vierbergen +Juanjo Benages +Xiao Di Guan +Thomas Winant +Daniel Twardowski +Jeremie Jarosh +Gerard Rovira +Marvin Ewald +Frédéric Bournival +Timendum +gritstub +Adam Voss +Mike Fährmann +Jan Kundrát +Giuseppe Fabiano +Örn Guðjónsson +Parmjit Virk +Genki Sky +Ľuboš Katrinec +Corey Nicholson +Ashutosh Chaudhary +John Dong +Tatsuyuki Ishi +Daniel Weber +Kay Bouché +Yang Hongbo +Lei Wang +Petr Novák +Leonardo Taccari +Martin Weinelt +Surya Oktafendri +TingPing +Alexandre Macabies +Bastian de Groot +Niklas Haas +András Veres-Szentkirályi +Enes Solak +Nathan Rossi +Thomas van der Berg +Luca Cherubin diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..9b52b7b --- /dev/null +++ b/ChangeLog @@ -0,0 +1,5294 @@ +version 2020.09.20 + +Core +* [extractor/common] Relax interaction count extraction in _json_ld ++ [extractor/common] Extract author as uploader for VideoObject in _json_ld +* 
[downloader/hls] Fix incorrect end byte in Range HTTP header for + media segments with EXT-X-BYTERANGE (#14748, #24512) +* [extractor/common] Handle ssl.CertificateError in _request_webpage (#26601) +* [downloader/http] Improve timeout detection when reading block of data + (#10935) +* [downloader/http] Retry download when urlopen times out (#10935, #26603) + +Extractors +* [redtube] Extend URL regular expression (#26506) +* [twitch] Refactor +* [twitch:stream] Switch to GraphQL and fix reruns (#26535) ++ [telequebec] Add support for brightcove videos (#25833) +* [pornhub] Extract metadata from JSON-LD (#26614) +* [pornhub] Fix view count extraction (#26621, #26614) + + +version 2020.09.14 + +Core ++ [postprocessor/embedthumbnail] Add support for non jpg/png thumbnails + (#25687, #25717) + +Extractors +* [rtlnl] Extend URL regular expression (#26549, #25821) +* [youtube] Fix empty description extraction (#26575, #26006) +* [srgssr] Extend URL regular expression (#26555, #26556, #26578) +* [googledrive] Use redirect URLs for source format (#18877, #23919, #24689, + #26565) +* [svtplay] Fix id extraction (#26576) +* [redbulltv] Improve support for redbull.com TV localized URLs (#22063) ++ [redbulltv] Add support for new redbull.com TV URLs (#22037, #22063) +* [soundcloud:pagedplaylist] Reduce pagination limit (#26557) + + +version 2020.09.06 + +Core ++ [utils] Recognize wav mimetype (#26463) + +Extractors +* [nrktv:episode] Improve video id extraction (#25594, #26369, #26409) +* [youtube] Fix age gate content detection (#26100, #26152, #26311, #26384) +* [youtube:user] Extend URL regular expression (#26443) +* [xhamster] Improve initials regular expression (#26526, #26353) +* [svtplay] Fix video id extraction (#26425, #26428, #26438) +* [twitch] Rework extractors (#12297, #20414, #20604, #21811, #21812, #22979, + #24263, #25010, #25553, #25606) + * Switch to GraphQL + + Add support for collections + + Add support for clips and collections playlists +* [biqle] Improve 
video ext extraction +* [xhamster] Fix extraction (#26157, #26254) +* [xhamster] Extend URL regular expression (#25789, #25804, #25927) + + +version 2020.07.28 + +Extractors +* [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) +* [youtube] Improve description extraction (#25937, #25980) +* [wistia] Restrict embed regular expression (#25969) +* [youtube] Prevent excess HTTP 301 (#25786) ++ [youtube:playlists] Extend URL regular expression (#25810) ++ [bellmedia] Add support for cp24.com clip URLs (#25764) +* [brightcove] Improve embed detection (#25674) + + +version 2020.06.16.1 + +Extractors +* [youtube] Force old layout (#25682, #25683, #25680, #25686) +* [youtube] Fix categories and improve tags extraction + + +version 2020.06.16 + +Extractors +* [youtube] Fix uploader id and uploader URL extraction +* [youtube] Improve view count extraction +* [youtube] Fix upload date extraction (#25677) +* [youtube] Fix thumbnails extraction (#25676) +* [youtube] Fix playlist and feed extraction (#25675) ++ [facebook] Add support for single-video ID links ++ [youtube] Extract chapters from JSON (#24819) ++ [kaltura] Add support for multiple embeds on a webpage (#25523) + + +version 2020.06.06 + +Extractors +* [tele5] Bypass geo restriction ++ [jwplatform] Add support for bypassing geo restriction +* [tele5] Prefer jwplatform over nexx (#25533) +* [twitch:stream] Expect 400 and 410 HTTP errors from API +* [twitch:stream] Fix extraction (#25528) +* [twitch] Fix thumbnails extraction (#25531) ++ [twitch] Pass v5 Accept HTTP header (#25531) +* [brightcove] Fix subtitles extraction (#25540) ++ [malltv] Add support for sk.mall.tv (#25445) +* [periscope] Fix untitled broadcasts (#25482) +* [jwplatform] Improve embeds extraction (#25467) + + +version 2020.05.29 + +Core +* [postprocessor/ffmpeg] Embed series metadata with --add-metadata +* [utils] Fix file permissions in write_json_file (#12471, #25122) + +Extractors +* [ard:beta] Extend URL regular expression 
(#25405) ++ [youtube] Add support for more invidious instances (#25417) +* [giantbomb] Extend URL regular expression (#25222) +* [ard] Improve URL regular expression (#25134, #25198) +* [redtube] Improve formats extraction and extract m3u8 formats (#25311, + #25321) +* [indavideo] Switch to HTTPS for API request (#25191) +* [redtube] Improve title extraction (#25208) +* [vimeo] Improve format extraction and sorting (#25285) +* [soundcloud] Reduce API playlist page limit (#25274) ++ [youtube] Add support for yewtu.be (#25226) +* [mailru] Fix extraction (#24530, #25239) +* [bellator] Fix mgid extraction (#25195) + + +version 2020.05.08 + +Core +* [downloader/http] Request last data block of exact remaining size +* [downloader/http] Finish downloading once received data length matches + expected +* [extractor/common] Use compat_cookiejar_Cookie for _set_cookie to always + ensure cookie name and value are bytestrings on python 2 (#23256, #24776) ++ [compat] Introduce compat_cookiejar_Cookie +* [utils] Improve cookie files support + + Add support for UTF-8 in cookie files + * Skip malformed cookie file entries instead of crashing (invalid entry + length, invalid expires at) + +Extractors +* [youtube] Improve signature cipher extraction (#25187, #25188) +* [iprima] Improve extraction (#25138) +* [uol] Fix extraction (#22007) ++ [orf] Add support for more radio stations (#24938, #24968) +* [dailymotion] Fix typo +- [puhutv] Remove no longer available HTTP formats (#25124) + + +version 2020.05.03 + +Core ++ [extractor/common] Extract multiple JSON-LD entries +* [options] Clarify doc on --exec command (#19087, #24883) +* [extractor/common] Skip malformed ISM manifest XMLs while extracting + ISM formats (#24667) + +Extractors +* [crunchyroll] Fix and improve extraction (#25096, #25060) +* [youtube] Improve player id extraction +* [youtube] Use redirected video id if any (#25063) +* [yahoo] Fix GYAO Player extraction and relax URL regular expression + (#24178, #24778) +* 
[tvplay] Fix Viafree extraction (#15189, #24473, #24789) +* [tenplay] Relax URL regular expression (#25001) ++ [prosiebensat1] Extract series metadata +* [prosiebensat1] Improve extraction and remove 7tv.de support (#24948) +- [prosiebensat1] Remove 7tv.de support (#24948) +* [youtube] Fix DRM videos detection (#24736) +* [thisoldhouse] Fix video id extraction (#24548, #24549) ++ [soundcloud] Extract AAC format (#19173, #24708) +* [youtube] Skip broken multifeed videos (#24711) +* [nova:embed] Fix extraction (#24700) +* [motherless] Fix extraction (#24699) +* [twitch:clips] Extend URL regular expression (#24290, #24642) +* [tv4] Fix ISM formats extraction (#24667) +* [tele5] Fix extraction (#24553) ++ [mofosex] Add support for generic embeds (#24633) ++ [youporn] Add support for generic embeds ++ [spankwire] Add support for generic embeds (#24633) +* [spankwire] Fix extraction (#18924, #20648) + + +version 2020.03.24 + +Core +- [utils] Revert support for cookie files with spaces used instead of tabs + +Extractors +* [teachable] Update upskillcourses and gns3 domains +* [generic] Look for teachable embeds before wistia ++ [teachable] Extract chapter metadata (#24421) ++ [bilibili] Add support for player.bilibili.com (#24402) ++ [bilibili] Add support for new URL schema with BV ids (#24439, #24442) +* [limelight] Remove disabled API requests (#24255) +* [soundcloud] Fix download URL extraction (#24394) ++ [cbc:watch] Add support for authentication (#19160) +* [hellporno] Fix extraction (#24399) +* [xtube] Fix formats extraction (#24348) +* [ndr] Fix extraction (#24326) +* [nhk] Update m3u8 URL and use native HLS downloader (#24329) +- [nhk] Remove obsolete rtmp formats (#24329) +* [nhk] Relax URL regular expression (#24329) +- [vimeo] Revert fix showcase password protected video extraction (#24224) + + +version 2020.03.08 + +Core ++ [utils] Add support for cookie files with spaces used instead of tabs + +Extractors ++ [pornhub] Add support for pornhubpremium.com 
(#24288) +- [youtube] Remove outdated code and unnecessary requests +* [youtube] Improve extraction in 429 HTTP error conditions (#24283) +* [nhk] Update API version (#24270) + + +version 2020.03.06 + +Extractors +* [youtube] Fix age-gated videos support without login (#24248) +* [vimeo] Fix showcase password protected video extraction (#24224) +* [pornhub] Improve title extraction (#24184) +* [peertube] Improve extraction (#23657) ++ [servus] Add support for new URL schema (#23475, #23583, #24142) +* [vimeo] Fix subtitles URLs (#24209) + + +version 2020.03.01 + +Core +* [YoutubeDL] Force redirect URL to unicode on python 2 +- [options] Remove duplicate short option -v for --version (#24162) + +Extractors +* [xhamster] Fix extraction (#24205) +* [franceculture] Fix extraction (#24204) ++ [telecinco] Add support for article opening videos +* [telecinco] Fix extraction (#24195) +* [xtube] Fix metadata extraction (#21073, #22455) +* [youjizz] Fix extraction (#24181) +- Remove no longer needed compat_str around geturl +* [pornhd] Fix extraction (#24128) ++ [teachable] Add support for multiple videos per lecture (#24101) ++ [wistia] Add support for multiple generic embeds (#8347, #11385) +* [imdb] Fix extraction (#23443) +* [tv2dk:bornholm:play] Fix extraction (#24076) + + +version 2020.02.16 + +Core +* [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, + #10622) +* [update] Fix updating via symlinks (#23991) ++ [compat] Introduce compat_realpath (#23991) + +Extractors ++ [npr] Add support for streams (#24042) ++ [24video] Add support for porn.24video.net (#23779, #23784) +- [jpopsuki] Remove extractor (#23858) +* [nova] Improve extraction (#23690) +* [nova:embed] Improve (#23690) +* [nova:embed] Fix extraction (#23672) ++ [abc:iview] Add support for 720p (#22907, #22921) +* [nytimes] Improve format sorting (#24010) ++ [toggle] Add support for mewatch.sg (#23895, #23930) +* [thisoldhouse] Fix extraction (#23951) ++ [popcorntimes] Add support for 
popcorntimes.tv (#23949) +* [sportdeutschland] Update to new API +* [twitch:stream] Lowercase channel id for stream request (#23917) +* [tv5mondeplus] Fix extraction (#23907, #23911) +* [tva] Relax URL regular expression (#23903) +* [vimeo] Fix album extraction (#23864) +* [viewlift] Improve extraction + * Fix extraction (#23851) + + Add support for authentication + + Add support for more domains +* [svt] Fix series extraction (#22297) +* [svt] Fix article extraction (#22897, #22919) +* [soundcloud] Improve private playlist/set tracks extraction (#3707) + + +version 2020.01.24 + +Extractors +* [youtube] Fix sigfunc name extraction (#23819) +* [stretchinternet] Fix extraction (#4319) +* [voicerepublic] Fix extraction +* [azmedien] Fix extraction (#23783) +* [businessinsider] Fix jwplatform id extraction (#22929, #22954) ++ [24video] Add support for 24video.vip (#23753) +* [ivi:compilation] Fix entries extraction (#23770) +* [ard] Improve extraction (#23761) + * Simplify extraction + + Extract age limit and series + * Bypass geo-restriction ++ [nbc] Add support for nbc multi network URLs (#23049) +* [americastestkitchen] Fix extraction +* [zype] Improve extraction + + Extract subtitles (#21258) + + Support URLs with alternative keys/tokens (#21258) + + Extract more metadata +* [orf:tvthek] Improve geo restricted videos detection (#23741) +* [soundcloud] Restore previews extraction (#23739) + + +version 2020.01.15 + +Extractors +* [yourporn] Fix extraction (#21645, #22255, #23459) ++ [canvas] Add support for new API endpoint (#17680, #18629) +* [ndr:base:embed] Improve thumbnails extraction (#23731) ++ [vodplatform] Add support for embed.kwikmotion.com domain ++ [twitter] Add support for promo_video_website cards (#23711) +* [orf:radio] Clean description and improve extraction +* [orf:fm4] Fix extraction (#23599) +* [safari] Fix kaltura session extraction (#23679, #23670) +* [lego] Fix extraction and extract subtitle (#23687) +* [cloudflarestream] Improve extraction + 
+ Add support for bytehighway.net domain + + Add support for signed URLs + + Extract thumbnail +* [naver] Improve extraction + * Improve geo-restriction handling + + Extract automatic captions + + Extract uploader metadata + + Extract VLive HLS formats + * Improve metadata extraction +- [pandatv] Remove extractor (#23630) +* [dctp] Fix format extraction (#23656) ++ [scrippsnetworks] Add support for www.discovery.com videos +* [discovery] Fix anonymous token extraction (#23650) +* [nrktv:seriebase] Fix extraction (#23625, #23537) +* [wistia] Improve format extraction and extract subtitles (#22590) +* [vice] Improve extraction (#23631) +* [redtube] Detect private videos (#23518) + + +version 2020.01.01 + +Extractors +* [brightcove] Invalidate policy key cache on failing requests +* [pornhub] Improve locked videos detection (#22449, #22780) ++ [pornhub] Add support for m3u8 formats +* [pornhub] Fix extraction (#22749, #23082) +* [brightcove] Update policy key on failing requests +* [spankbang] Improve removed video detection (#23423) +* [spankbang] Fix extraction (#23307, #23423, #23444) +* [soundcloud] Automatically update client id on failing requests +* [prosiebensat1] Improve geo restriction handling (#23571) +* [brightcove] Cache brightcove player policy keys +* [teachable] Fail with error message if no video URL found +* [teachable] Improve locked lessons detection (#23528) ++ [scrippsnetworks] Add support for Scripps Networks sites (#19857, #22981) +* [mitele] Fix extraction (#21354, #23456) +* [soundcloud] Update client id (#23516) +* [mailru] Relax URL regular expressions (#23509) + + +version 2019.12.25 + +Core +* [utils] Improve str_to_int ++ [downloader/hls] Add ability to override AES decryption key URL (#17521) + +Extractors +* [mediaset] Fix parse formats (#23508) ++ [tv2dk:bornholm:play] Add support for play.tv2bornholm.dk (#23291) ++ [slideslive] Add support for url and vimeo service names (#23414) +* [slideslive] Fix extraction (#23413) +* 
[twitch:clips] Fix extraction (#23375) ++ [soundcloud] Add support for token protected embeds (#18954) +* [vk] Improve extraction + * Fix User Videos extraction (#23356) + * Extract all videos for lists with more than 1000 videos (#23356) + + Add support for video albums (#14327, #14492) +- [kontrtube] Remove extractor +- [videopremium] Remove extractor +- [musicplayon] Remove extractor (#9225) ++ [ufctv] Add support for ufcfightpass.imgdge.com and + ufcfightpass.imggaming.com (#23343) ++ [twitch] Extract m3u8 formats frame rate (#23333) ++ [imggaming] Add support for playlists and extract subtitles ++ [ufcarabia] Add support for UFC Arabia (#23312) +* [ufctv] Fix extraction +* [yahoo] Fix gyao brightcove player id (#23303) +* [vzaar] Override AES decryption key URL (#17521) ++ [vzaar] Add support for AES HLS manifests (#17521, #23299) +* [nrl] Fix extraction +* [teachingchannel] Fix extraction +* [nintendo] Fix extraction and partially add support for Nintendo Direct + videos (#4592) ++ [ooyala] Add better fallback values for domain and streams variables ++ [youtube] Add support for youtubekids.com (#23272) +* [tv2] Detect DRM protection ++ [tv2] Add support for katsomo.fi and mtv.fi (#10543) +* [tv2] Fix tv2.no article extraction +* [msn] Improve extraction + + Add support for YouTube and NBCSports embeds + + Add support for articles with multiple videos + * Improve AOL embed support + * Improve format extraction +* [abcotvs] Relax URL regular expression and improve metadata extraction + (#18014) +* [channel9] Reduce response size +* [adobetv] Improve extraction + * Use OnDemandPagedList for list extractors + * Reduce show extraction requests + * Extract original video format and subtitles + + Add support for adobe tv embeds + + +version 2019.11.28 + +Core ++ [utils] Add generic caesar cipher and rot47 +* [utils] Handle rd-suffixed day parts in unified_strdate (#23199) + +Extractors +* [vimeo] Improve extraction + * Fix review extraction + * Fix ondemand extraction + 
* Make password protected player case as an expected error (#22896) + * Simplify channel based extractors code +- [openload] Remove extractor (#11999) +- [verystream] Remove extractor +- [streamango] Remove extractor (#15406) +* [dailymotion] Improve extraction + * Extract http formats included in m3u8 manifest + * Fix user extraction (#3553, #21415) + + Add support for User Authentication (#11491) + * Fix password protected videos extraction (#23176) + * Respect age limit option and family filter cookie value (#18437) + * Handle video url playlist query param + * Report allowed countries for geo-restricted videos +* [corus] Improve extraction + + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com + and disneylachaine.ca (#20861) + + Add support for self hosted videos (#22075) + * Detect DRM protection (#14910, #9164) +* [vivo] Fix extraction (#22328, #22279) ++ [bitchute] Extract upload date (#22990, #23193) +* [soundcloud] Update client id (#23214) + + +version 2019.11.22 + +Core ++ [extractor/common] Clean jwplayer description HTML tags ++ [extractor/common] Add data, headers and query to all major extract formats + methods + +Extractors +* [chaturbate] Fix extraction (#23010, #23012) ++ [ntvru] Add support for non relative file URLs (#23140) +* [vk] Fix wall audio thumbnails extraction (#23135) +* [ivi] Fix format extraction (#21991) +- [comcarcoff] Remove extractor ++ [drtv] Add support for new URL schema (#23059) ++ [nexx] Add support for Multi Player JS Setup (#23052) ++ [teamcoco] Add support for new videos (#23054) +* [soundcloud] Check if the soundtrack has downloads left (#23045) +* [facebook] Fix posts video data extraction (#22473) +- [addanime] Remove extractor +- [minhateca] Remove extractor +- [daisuki] Remove extractor +* [seeker] Fix extraction +- [revision3] Remove extractors +* [twitch] Fix video comments URL (#18593, #15828) +* [twitter] Improve extraction + + Add support for generic embeds (#22168) + * Always extract http 
formats for native videos (#14934) + + Add support for Twitter Broadcasts (#21369) + + Extract more metadata + * Improve VMap format extraction + * Unify extraction code for both twitter statuses and cards ++ [twitch] Add support for Clip embed URLs +* [lnkgo] Fix extraction (#16834) +* [mixcloud] Improve extraction + * Improve metadata extraction (#11721) + * Fix playlist extraction (#22378) + * Fix user mixes extraction (#15197, #17865) ++ [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384) +* [onionstudios] Fix extraction ++ [hotstar] Pass Referer header to format requests (#22836) +* [dplay] Minimize response size ++ [patreon] Extract uploader_id and filesize +* [patreon] Minimize response size +* [roosterteeth] Fix login request (#16094, #22689) + + +version 2019.11.05 + +Extractors ++ [scte] Add support for learning.scte.org (#22975) ++ [msn] Add support for Vidible and AOL embeds (#22195, #22227) +* [myspass] Fix video URL extraction and improve metadata extraction (#22448) +* [jamendo] Improve extraction + * Fix album extraction (#18564) + * Improve metadata extraction (#18565, #21379) +* [mediaset] Relax URL guid matching (#18352) ++ [mediaset] Extract unprotected M3U and MPD manifests (#17204) +* [telegraaf] Fix extraction ++ [bellmedia] Add support for marilyn.ca videos (#22193) +* [stv] Fix extraction (#22928) +- [iconosquare] Remove extractor +- [keek] Remove extractor +- [gameone] Remove extractor (#21778) +- [flipagram] Remove extractor +- [bambuser] Remove extractor +* [wistia] Reduce embed extraction false positives ++ [wistia] Add support for inline embeds (#22931) +- [go90] Remove extractor +* [kakao] Remove raw request ++ [kakao] Extract format total bitrate +* [daum] Fix VOD and Clip extraction (#15015) +* [kakao] Improve extraction + + Add support for embed URLs + + Add support for Kakao Legacy vid based embed URLs + * Only extract fields used for extraction + * Strip description and extract tags +* [mixcloud] Fix cloudcast 
data extraction (#22821) +* [yahoo] Improve extraction + + Add support for live streams (#3597, #3779, #22178) + * Bypass cookie consent page for european domains (#16948, #22576) + + Add generic support for embeds (#20332) +* [tv2] Fix and improve extraction (#22787) ++ [tv2dk] Add support for TV2 DK sites +* [onet] Improve extraction + + Add support for onet100.vod.pl + + Extract m3u8 formats + * Correct audio only format info +* [fox9] Fix extraction + + +version 2019.10.29 + +Core +* [utils] Actualize major IPv4 address blocks per country + +Extractors ++ [go] Add support for abc.com and freeform.com (#22823, #22864) ++ [mtv] Add support for mtvjapan.com +* [mtv] Fix extraction for mtv.de (#22113) +* [videodetective] Fix extraction +* [internetvideoarchive] Fix extraction +* [nbcnews] Fix extraction (#12569, #12576, #21703, #21923) +- [hark] Remove extractor +- [tutv] Remove extractor +- [learnr] Remove extractor +- [macgamestore] Remove extractor +* [la7] Update Kaltura service URL (#22358) +* [thesun] Fix extraction (#16966) +- [makertv] Remove extractor ++ [tenplay] Add support for 10play.com.au (#21446) +* [soundcloud] Improve extraction + * Improve format extraction (#22123) + + Extract uploader_id and uploader_url (#21916) + + Extract all known thumbnails (#19071, #20659) + * Fix extraction for private playlists (#20976) + + Add support for playlist embeds (#20976) + * Skip preview formats (#22806) +* [dplay] Improve extraction + + Add support for dplay.fi, dplay.jp and es.dplay.com (#16969) + * Fix it.dplay.com extraction (#22826) + + Extract creator, tags and thumbnails + * Handle playback API call errors ++ [discoverynetworks] Add support for dplay.co.uk +* [vk] Improve extraction + + Add support for Odnoklassniki embeds + + Extract more videos from user lists (#4470) + + Fix wall post audio extraction (#18332) + * Improve error detection (#22568) ++ [odnoklassniki] Add support for embeds +* [puhutv] Improve extraction + * Fix subtitles extraction + * 
Transform HLS URLs to HTTP URLs + * Improve metadata extraction +* [ceskatelevize] Skip DRM media ++ [facebook] Extract subtitles (#22777) +* [globo] Handle alternative hash signing method + + +version 2019.10.22 + +Core +* [utils] Improve subtitles_filename (#22753) + +Extractors +* [facebook] Bypass download rate limits (#21018) ++ [contv] Add support for contv.com +- [viewster] Remove extractor +* [xfileshare] Improve extractor (#17032, #17906, #18237, #18239) + * Update the list of domains + + Add support for aa-encoded video data + * Improve jwplayer format extraction + + Add support for Clappr sources +* [mangomolo] Fix video format extraction and add support for player URLs +* [audioboom] Improve metadata extraction +* [twitch] Update VOD URL matching (#22395, #22727) +- [mit] Remove support for video.mit.edu (#22403) +- [servingsys] Remove extractor (#22639) +* [dumpert] Fix extraction (#22428, #22564) +* [atresplayer] Fix extraction (#16277, #16716) + + +version 2019.10.16 + +Core +* [extractor/common] Make _is_valid_url more relaxed + +Extractors +* [vimeo] Improve album videos id extraction (#22599) ++ [globo] Extract subtitles (#22713) +* [bokecc] Improve player params extraction (#22638) +* [nexx] Handle result list (#22666) +* [vimeo] Fix VHX embed extraction +* [nbc] Switch to graphql API (#18581, #22693, #22701) +- [vessel] Remove extractor +- [promptfile] Remove extractor (#6239) +* [kaltura] Fix service URL extraction (#22658) +* [kaltura] Fix embed info strip (#22658) +* [globo] Fix format extraction (#20319) +* [redtube] Improve metadata extraction (#22492, #22615) +* [pornhub:uservideos:upload] Fix extraction (#22619) ++ [telequebec:squat] Add support for squat.telequebec.tv (#18503) +- [wimp] Remove extractor (#22088, #22091) ++ [gfycat] Extend URL regular expression (#22225) ++ [chaturbate] Extend URL regular expression (#22309) +* [peertube] Update instances (#22414) ++ [telequebec] Add support for coucou.telequebec.tv (#22482) ++ [xvideos] 
Extend URL regular expression (#22471) +- [youtube] Remove support for invidious.enkirton.net (#22543) ++ [openload] Add support for oload.monster (#22592) +* [nrktv:seriebase] Fix extraction (#22596) ++ [youtube] Add support for yt.lelux.fi (#22597) +* [orf:tvthek] Make manifest requests non fatal (#22578) +* [teachable] Skip login when already logged in (#22572) +* [viewlift] Improve extraction (#22545) +* [nonktube] Fix extraction (#22544) + + +version 2019.09.28 + +Core +* [YoutubeDL] Honour all --get-* options with --flat-playlist (#22493) + +Extractors +* [vk] Fix extraction (#22522) +* [heise] Fix kaltura embeds extraction (#22514) +* [ted] Check for resources validity and extract subtitled downloads (#22513) ++ [youtube] Add support for + owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya.b32.i2p (#22292) ++ [nhk] Add support for clips +* [nhk] Fix video extraction (#22249, #22353) +* [byutv] Fix extraction (#22070) ++ [openload] Add support for oload.online (#22304) ++ [youtube] Add support for invidious.drycat.fr (#22451) +* [jwplatform] Do not match video URLs (#20596, #22148) +* [youtube:playlist] Unescape playlist uploader (#22483) ++ [bilibili] Add support for audio albums and songs (#21094) ++ [instagram] Add support for tv URLs ++ [mixcloud] Allow uppercase letters in format URLs (#19280) +* [brightcove] Delegate all supported legacy URLs to new extractor (#11523, + #12842, #13912, #15669, #16303) +* [hotstar] Use native HLS downloader by default ++ [hotstar] Extract more formats (#22323) +* [9now] Fix extraction (#22361) +* [zdf] Bypass geo restriction ++ [tv4] Extract series metadata +* [tv4] Fix extraction (#22443) + + +version 2019.09.12.1 + +Extractors +* [youtube] Remove quality and tbr for itag 43 (#22372) + + +version 2019.09.12 + +Extractors +* [youtube] Quick extraction tempfix (#22367, #22163) + + +version 2019.09.01 + +Core ++ [extractor/generic] Add support for squarespace embeds (#21294, #21802, + #21859) ++ [downloader/external] Respect 
mtime option for aria2c (#22242) + +Extractors ++ [xhamster:user] Add support for user pages (#16330, #18454) ++ [xhamster] Add support for more domains ++ [verystream] Add support for woof.tube (#22217) ++ [dailymotion] Add support for lequipe.fr (#21328, #22152) ++ [openload] Add support for oload.vip (#22205) ++ [bbccouk] Extend URL regular expression (#19200) ++ [youtube] Add support for invidious.nixnet.xyz and yt.elukerio.org (#22223) +* [safari] Fix authentication (#22161, #22184) +* [usanetwork] Fix extraction (#22105) ++ [einthusan] Add support for einthusan.ca (#22171) +* [youtube] Improve unavailable message extraction (#22117) ++ [piksel] Extract subtitles (#20506) + + +version 2019.08.13 + +Core +* [downloader/fragment] Fix ETA calculation of resumed download (#21992) +* [YoutubeDL] Check annotations availability (#18582) + +Extractors +* [youtube:playlist] Improve flat extraction (#21927) +* [youtube] Fix annotations extraction (#22045) ++ [discovery] Extract series meta field (#21808) +* [youtube] Improve error detection (#16445) +* [vimeo] Fix album extraction (#1933, #15704, #15855, #18967, #21986) ++ [roosterteeth] Add support for watch URLs +* [discovery] Limit video data by show slug (#21980) + + +version 2019.08.02 + +Extractors ++ [tvigle] Add support for HLS and DASH formats (#21967) +* [tvigle] Fix extraction (#21967) ++ [yandexvideo] Add support for DASH formats (#21971) +* [discovery] Use API call for video data extraction (#21808) ++ [mgtv] Extract format_note (#21881) +* [tvn24] Fix metadata extraction (#21833, #21834) +* [dlive] Relax URL regular expression (#21909) ++ [openload] Add support for oload.best (#21913) +* [youtube] Improve metadata extraction for age gate content (#21943) + + +version 2019.07.30 + +Extractors +* [youtube] Fix and improve title and description extraction (#21934) + + +version 2019.07.27 + +Extractors ++ [yahoo:japannews] Add support for yahoo.co.jp (#21698, #21265) ++ [discovery] Add support go.discovery.com 
URLs +* [youtube:playlist] Relax video regular expression (#21844) +* [generic] Restrict --default-search schemeless URLs detection pattern + (#21842) +* [vrv] Fix CMS signing query extraction (#21809) + + +version 2019.07.16 + +Extractors ++ [asiancrush] Add support for yuyutv.com, midnightpulp.com and cocoro.tv + (#21281, #21290) +* [kaltura] Check source format URL (#21290) +* [ctsnews] Fix YouTube embeds extraction (#21678) ++ [einthusan] Add support for einthusan.com (#21748, #21775) ++ [youtube] Add support for invidious.mastodon.host (#21777) ++ [gfycat] Extend URL regular expression (#21779, #21780) +* [youtube] Restrict is_live extraction (#21782) + + +version 2019.07.14 + +Extractors +* [porn91] Fix extraction (#21312) ++ [yandexmusic] Extract track number and disk number (#21421) ++ [yandexmusic] Add support for multi disk albums (#21420, #21421) +* [lynda] Handle missing subtitles (#20490, #20513) ++ [youtube] Add more invidious instances to URL regular expression (#21694) +* [twitter] Improve uploader id extraction (#21705) +* [spankbang] Fix and improve metadata extraction +* [spankbang] Fix extraction (#21763, #21764) ++ [dlive] Add support for dlive.tv (#18080) ++ [livejournal] Add support for livejournal.com (#21526) +* [roosterteeth] Fix free episode extraction (#16094) +* [dbtv] Fix extraction +* [bellator] Fix extraction +- [rudo] Remove extractor (#18430, #18474) +* [facebook] Fallback to twitter:image meta for thumbnail extraction (#21224) +* [bleacherreport] Fix Bleacher Report CMS extraction +* [espn] Fix fivethirtyeight.com extraction +* [5tv] Relax video URL regular expression and support https URLs +* [youtube] Fix is_live extraction (#21734) +* [youtube] Fix authentication (#11270) + + +version 2019.07.12 + +Core ++ [adobepass] Add support for AT&T U-verse (mso ATT) (#13938, #21016) + +Extractors ++ [mgtv] Pass Referer HTTP header for format URLs (#21726) ++ [beeg] Add support for api/v6 v2 URLs without t argument (#21701) +* 
[voxmedia:volume] Improve vox embed extraction (#16846) +* [funnyordie] Move extraction to VoxMedia extractor (#16846) +* [gameinformer] Fix extraction (#8895, #15363, #17206) +* [funk] Fix extraction (#17915) +* [packtpub] Relax lesson URL regular expression (#21695) +* [packtpub] Fix extraction (#21268) +* [philharmoniedeparis] Relax URL regular expression (#21672) +* [peertube] Detect embed URLs in generic extraction (#21666) +* [mixer:vod] Relax URL regular expression (#21657, #21658) ++ [lecturio] Add support for id based URLs (#21630) ++ [go] Add site info for disneynow (#21613) +* [ted] Restrict info regular expression (#21631) +* [twitch:vod] Actualize m3u8 URL (#21538, #21607) +* [vzaar] Fix videos with empty title (#21606) +* [tvland] Fix extraction (#21384) +* [arte] Clean extractor (#15583, #21614) + + +version 2019.07.02 + +Core ++ [utils] Introduce random_user_agent and use as default User-Agent (#21546) + +Extractors ++ [vevo] Add support for embed.vevo.com URLs (#21565) ++ [openload] Add support for oload.biz (#21574) +* [xiami] Update API base URL (#21575) +* [yourporn] Fix extraction (#21585) ++ [acast] Add support for URLs with episode id (#21444) ++ [dailymotion] Add support for DM.player embeds +* [soundcloud] Update client id + + +version 2019.06.27 + +Extractors ++ [go] Add support for disneynow.com (#21528) +* [mixer:vod] Relax URL regular expression (#21531, #21536) +* [drtv] Relax URL regular expression +* [fusion] Fix extraction (#17775, #21269) +- [nfb] Remove extractor (#21518) ++ [beeg] Add support for api/v6 v2 URLs (#21511) ++ [brightcove:new] Add support for playlists (#21331) ++ [openload] Add support for oload.life (#21495) +* [vimeo:channel,group] Make title extraction non fatal +* [vimeo:likes] Implement extractor in terms of channel extractor (#21493) ++ [pornhub] Add support for more paged video sources ++ [pornhub] Add support for downloading single pages and search pages (#15570) +* [pornhub] Rework extractors (#11922, #16078, 
#17454, #17936) ++ [youtube] Add another signature function pattern +* [tf1] Fix extraction (#21365, #21372) +* [crunchyroll] Move Accept-Language workaround to video extractor since + it causes playlists not to list any videos +* [crunchyroll:playlist] Fix and relax title extraction (#21291, #21443) + + +version 2019.06.21 + +Core +* [utils] Restrict parse_codecs and add theora as known vcodec (#21381) + +Extractors +* [youtube] Update signature function patterns (#21469, #21476) +* [youtube] Make --write-annotations non fatal (#21452) ++ [sixplay] Add support for rtlmost.hu (#21405) +* [youtube] Hardcode codec metadata for av01 video only formats (#21381) +* [toutv] Update client key (#21370) ++ [biqle] Add support for new embed domain +* [cbs] Improve DRM protected videos detection (#21339) + + +version 2019.06.08 + +Core +* [downloader/common] Improve rate limit (#21301) +* [utils] Improve strip_or_none +* [extractor/common] Strip src attribute for HTML5 entries code (#18485, + #21169) + +Extractors +* [ted] Fix playlist extraction (#20844, #21032) +* [vlive:playlist] Fix video extraction when no playlist is found (#20590) ++ [vlive] Add CH+ support (#16887, #21209) ++ [openload] Add support for oload.website (#21329) ++ [tvnow] Extract HD formats (#21201) ++ [redbulltv] Add support for rrn:content URLs (#21297) +* [youtube] Fix average rating extraction (#21304) ++ [bitchute] Extract HTML5 formats (#21306) +* [cbsnews] Fix extraction (#9659, #15397) +* [vvvvid] Relax URL regular expression (#21299) ++ [prosiebensat1] Add support for new API (#21272) ++ [vrv] Extract adaptive_hls formats (#21243) +* [viki] Switch to HTTPS (#21001) +* [LiveLeak] Check if the original videos exist (#21206, #21208) +* [rtp] Fix extraction (#15099) +* [youtube] Improve DRM protected videos detection (#1774) ++ [srgssrplay] Add support for popupvideoplayer URLs (#21155) ++ [24video] Add support for porno.24video.net (#21194) ++ [24video] Add support for 24video.site (#21193) +- 
[pornflip] Remove extractor +- [criterion] Remove extractor (#21195) +* [pornhub] Use HTTPS (#21061) +* [bitchute] Fix uploader extraction (#21076) +* [streamcloud] Reduce waiting time to 6 seconds (#21092) +- [novamov] Remove extractors (#21077) ++ [openload] Add support for oload.press (#21135) +* [vivo] Fix extraction (#18906, #19217) + + +version 2019.05.20 + +Core ++ [extractor/common] Move workaround for applying first Set-Cookie header + into a separate _apply_first_set_cookie_header method + +Extractors +* [safari] Fix authentication (#21090) +* [vk] Use _apply_first_set_cookie_header +* [vrt] Fix extraction (#20527) ++ [canvas] Add support for vrtnieuws and sporza site ids and extract + AES HLS formats ++ [vrv] Extract captions (#19238) +* [tele5] Improve video id extraction +* [tele5] Relax URL regular expression (#21020, #21063) +* [svtplay] Update API URL (#21075) ++ [yahoo:gyao] Add X-User-Agent header to dam proxy requests (#21071) + + +version 2019.05.11 + +Core +* [utils] Transliterate "þ" as "th" (#20897) + +Extractors ++ [cloudflarestream] Add support for videodelivery.net (#21049) ++ [byutv] Add support for DVR videos (#20574, #20676) ++ [gfycat] Add support for URLs with tags (#20696, #20731) ++ [openload] Add support for verystream.com (#20701, #20967) +* [youtube] Use sp field value for signature field name (#18841, #18927, + #21028) ++ [yahoo:gyao] Extend URL regular expression (#21008) +* [youtube] Fix channel id extraction (#20982, #21003) ++ [sky] Add support for news.sky.com (#13055) ++ [youtube:entrylistbase] Retry on 5xx HTTP errors (#20965) ++ [francetvinfo] Extend video id extraction (#20619, #20740) +* [4tube] Update token hosts (#20918) +* [hotstar] Move to API v2 (#20931) +* [fox] Fix API error handling under python 2 (#20925) ++ [redbulltv] Extend URL regular expression (#20922) + + +version 2019.04.30 + +Extractors +* [openload] Use real Chrome versions (#20902) +- [youtube] Remove info el for get_video_info request +* [youtube] 
Improve extraction robustness +- [dramafever] Remove extractor (#20868) +* [adn] Fix subtitle extraction (#12724) ++ [ccc] Extract creator (#20355) ++ [ccc:playlist] Add support for media.ccc.de playlists (#14601, #20355) ++ [sverigesradio] Add support for sverigesradio.se (#18635) ++ [cinemax] Add support for cinemax.com +* [sixplay] Try extracting non-DRM protected manifests (#20849) ++ [youtube] Extract Youtube Music Auto-generated metadata (#20599, #20742) +- [wrzuta] Remove extractor (#20684, #20801) +* [twitch] Prefer source format (#20850) ++ [twitcasting] Add support for private videos (#20843) +* [reddit] Validate thumbnail URL (#20030) +* [yandexmusic] Fix track URL extraction (#20820) + + +version 2019.04.24 + +Extractors +* [youtube] Fix extraction (#20758, #20759, #20761, #20762, #20764, #20766, + #20767, #20769, #20771, #20768, #20770) +* [toutv] Fix extraction and extract series info (#20757) ++ [vrv] Add support for movie listings (#19229) ++ [youtube] Print error when no data is available (#20737) ++ [soundcloud] Add support for new rendition and improve extraction (#20699) ++ [ooyala] Add support for geo verification proxy ++ [nrl] Add support for nrl.com (#15991) ++ [vimeo] Extract live archive source format (#19144) ++ [vimeo] Add support for live streams and improve info extraction (#19144) ++ [ntvcojp] Add support for cu.ntv.co.jp ++ [nhk] Extract RTMPT format ++ [nhk] Add support for audio URLs ++ [udemy] Add another course id extraction pattern (#20491) ++ [openload] Add support for oload.services (#20691) ++ [openload] Add support for openloed.co (#20691, #20693) +* [bravotv] Fix extraction (#19213) + + +version 2019.04.17 + +Extractors +* [openload] Randomize User-Agent (#20688) ++ [openload] Add support for oladblock domains (#20471) +* [adn] Fix subtitle extraction (#12724) ++ [aol] Add support for localized websites ++ [yahoo] Add support GYAO episode URLs ++ [yahoo] Add support for streaming.yahoo.co.jp (#5811, #7098) ++ [yahoo] Add 
support for gyao.yahoo.co.jp +* [aenetworks] Fix history topic extraction and extract more formats ++ [cbs] Extract smpte and vtt subtitles ++ [streamango] Add support for streamcherry.com (#20592) ++ [yourporn] Add support for sxyprn.com (#20646) +* [mgtv] Fix extraction (#20650) +* [linkedin:learning] Use urljoin for form action URL (#20431) ++ [gdc] Add support for kaltura embeds (#20575) +* [dispeak] Improve mp4 bitrate extraction +* [kaltura] Sanitize embed URLs +* [jwplatform] Do not match manifest URLs (#20596) +* [aol] Restrict URL regular expression and improve format extraction ++ [tiktok] Add support for new URL schema (#20573) ++ [stv:player] Add support for player.stv.tv (#20586) + + +version 2019.04.07 + +Core ++ [downloader/external] Pass rtmp_conn to ffmpeg + +Extractors ++ [ruutu] Add support for audio podcasts (#20473, #20545) ++ [xvideos] Extract all thumbnails (#20432) ++ [platzi] Add support for platzi.com (#20562) +* [dvtv] Fix extraction (#18514, #19174) ++ [vrv] Add basic support for individual movie links (#19229) ++ [bfi:player] Add support for player.bfi.org.uk (#19235) +* [hbo] Fix extraction and extract subtitles (#14629, #13709) +* [youtube] Extract srv[1-3] subtitle formats (#20566) +* [adultswim] Fix extraction (#18025) +* [teamcoco] Fix extraction and add support for subdomains (#17099, #20339) +* [adn] Fix subtitle compatibility with ffmpeg +* [adn] Fix extraction and add support for positioning styles (#20549) +* [vk] Use unique video id (#17848) +* [newstube] Fix extraction +* [rtl2] Actualize extraction ++ [adobeconnect] Add support for adobeconnect.com (#20283) ++ [gaia] Add support for authentication (#14605) ++ [mediasite] Add support for dashed ids and named catalogs (#20531) + + +version 2019.04.01 + +Core +* [utils] Improve int_or_none and float_or_none (#20403) +* Check for valid --min-sleep-interval when --max-sleep-interval is specified + (#20435) + +Extractors ++ [weibo] Extend URL regular expression (#20496) ++ 
[xhamster] Add support for xhamster.one (#20508) ++ [mediasite] Add support for catalogs (#20507) ++ [teamtreehouse] Add support for teamtreehouse.com (#9836) ++ [ina] Add support for audio URLs +* [ina] Improve extraction +* [cwtv] Fix episode number extraction (#20461) +* [npo] Improve DRM detection ++ [pornhub] Add support for DASH formats (#20403) +* [svtplay] Update API endpoint (#20430) + + +version 2019.03.18 + +Core +* [extractor/common] Improve HTML5 entries extraction ++ [utils] Introduce parse_bitrate +* [update] Hide update URLs behind redirect +* [extractor/common] Fix url meta field for unfragmented DASH formats (#20346) + +Extractors ++ [yandexvideo] Add extractor +* [openload] Improve embed detection ++ [corus] Add support for bigbrothercanada.ca (#20357) ++ [orf:radio] Extract series (#20012) ++ [cbc:watch] Add support for gem.cbc.ca (#20251, #20359) +- [anysex] Remove extractor (#19279) ++ [ciscolive] Add support for new URL schema (#20320, #20351) ++ [youtube] Add support for invidiou.sh (#20309) +- [anitube] Remove extractor (#20334) +- [ruleporn] Remove extractor (#15344, #20324) +* [npr] Fix extraction (#10793, #13440) +* [biqle] Fix extraction (#11471, #15313) +* [viddler] Modernize +* [moevideo] Fix extraction +* [primesharetv] Remove extractor +* [hypem] Modernize and extract more metadata (#15320) +* [veoh] Fix extraction +* [escapist] Modernize +- [videomega] Remove extractor (#10108) ++ [beeg] Add support for beeg.porn (#20306) +* [vimeo:review] Improve config url extraction and extract original format + (#20305) +* [fox] Detect geo restriction and authentication errors (#20208) + + +version 2019.03.09 + +Core +* [extractor/common] Use compat_etree_Element ++ [compat] Introduce compat_etree_Element +* [extractor/common] Fallback url to base URL for DASH formats +* [extractor/common] Do not fail on invalid data while parsing F4M manifest + in non fatal mode +* [extractor/common] Return MPD manifest as format's url meta field (#20242) +* 
[utils] Strip #HttpOnly_ prefix from cookies files (#20219) + +Extractors +* [francetv:site] Relax video id regular expression (#20268) +* [toutv] Detect invalid login error +* [toutv] Fix authentication (#20261) ++ [urplay] Extract timestamp (#20235) ++ [openload] Add support for oload.space (#20246) +* [facebook] Improve uploader extraction (#20250) +* [bbc] Use compat_etree_Element +* [crunchyroll] Use compat_etree_Element +* [npo] Improve ISM extraction +* [rai] Improve extraction (#20253) +* [paramountnetwork] Fix mgid extraction (#20241) +* [libsyn] Improve extraction (#20229) ++ [youtube] Add more invidious instances to URL regular expression (#20228) +* [spankbang] Fix extraction (#20023) +* [espn] Extend URL regular expression (#20013) +* [sixplay] Handle videos with empty assets (#20016) ++ [vimeo] Add support for Vimeo Pro portfolio protected videos (#20070) + + +version 2019.03.01 + +Core ++ [downloader/external] Add support for rate limit and retries for wget +* [downloader/external] Fix infinite retries for curl (#19303) + +Extractors +* [npo] Fix extraction (#20084) +* [francetv:site] Extend video id regex (#20029, #20071) ++ [periscope] Extract width and height (#20015) +* [servus] Fix extraction (#19297) +* [bbccouk] Make subtitles non fatal (#19651) +* [metacafe] Fix family filter bypass (#19287) + + +version 2019.02.18 + +Extractors +* [tvp:website] Fix and improve extraction ++ [tvp] Detect unavailable videos +* [tvp] Fix description extraction and make thumbnail optional ++ [linuxacademy] Add support for linuxacademy.com (#12207) +* [bilibili] Update keys (#19233) +* [udemy] Extend URL regular expressions (#14330, #15883) +* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126) +* [noovo] Fix extraction (#19230) +* [rai] Relax URL regular expression (#19232) ++ [vshare] Pass Referer to download request (#19205, #19221) ++ [openload] Add support for oload.live (#19222) +* [imgur] Use video id as title fallback (#18590) ++ 
[twitch] Add new source format detection approach (#19193) +* [tvplayhome] Fix video id extraction (#19190) +* [tvplayhome] Fix episode metadata extraction (#19190) +* [rutube:embed] Fix extraction (#19163) ++ [rutube:embed] Add support private videos (#19163) ++ [soundcloud] Extract more metadata ++ [trunews] Add support for trunews.com (#19153) ++ [linkedin:learning] Extract chapter_number and chapter_id (#19162) + + +version 2019.02.08 + +Core +* [utils] Improve JSON-LD regular expression (#18058) +* [YoutubeDL] Fallback to ie_key of matching extractor while making + download archive id when no explicit ie_key is provided (#19022) + +Extractors ++ [malltv] Add support for mall.tv (#18058, #17856) ++ [spankbang:playlist] Add support for playlists (#19145) +* [spankbang] Extend URL regular expression +* [trutv] Fix extraction (#17336) +* [toutv] Fix authentication (#16398, #18700) +* [pornhub] Fix tags and categories extraction (#13720, #19135) +* [pornhd] Fix formats extraction ++ [pornhd] Extract like count (#19123, #19125) +* [radiocanada] Switch to the new media requests (#19115) ++ [teachable] Add support for courses.workitdaily.com (#18871) +- [vporn] Remove extractor (#16276) ++ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086) ++ [drtuber] Extract duration (#19078) +* [soundcloud] Fix paged playlists extraction, add support for albums and update client id +* [soundcloud] Update client id +* [drtv] Improve preference (#19079) ++ [openload] Add support for openload.pw and oload.pw (#18930) ++ [openload] Add support for oload.info (#19073) +* [crackle] Authorize media detail request (#16931) + + +version 2019.01.30.1 + +Core +* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067) + + +version 2019.01.30 + +Core +* [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding + subtitles (#19024, #19042) +* [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025) + +Extractors +* [yourporn] 
Fix extraction and extract duration (#18815, #18852, #19061) +* [drtv] Improve extraction (#19039) + + Add support for EncryptedUri videos + + Extract more metadata + * Fix subtitles extraction ++ [fox] Add support for locked videos using cookies (#19060) +* [fox] Fix extraction for free videos (#19060) ++ [zattoo] Add support for tv.salt.ch (#19059) + + +version 2019.01.27 + +Core ++ [extractor/common] Extract season in _json_ld +* [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection + (#681) + +Extractors +* [vice] Fix extraction for locked videos (#16248) ++ [wakanim] Detect DRM protected videos ++ [wakanim] Add support for wakanim.tv (#14374) +* [usatoday] Fix extraction for videos with custom brightcove partner id + (#18990) +* [drtv] Fix extraction (#18989) +* [nhk] Extend URL regular expression (#18968) +* [go] Fix Adobe Pass requests for Disney Now (#18901) ++ [openload] Add support for oload.club (#18969) + + +version 2019.01.24 + +Core +* [YoutubeDL] Fix negation for string operators in format selection (#18961) + + +version 2019.01.23 + +Core +* [utils] Fix urljoin for paths with non-http(s) schemes +* [extractor/common] Improve jwplayer relative URL handling (#18892) ++ [YoutubeDL] Add negation support for string comparisons in format selection + expressions (#18600, #18805) +* [extractor/common] Improve HLS video-only format detection (#18923) + +Extractors +* [crunchyroll] Extend URL regular expression (#18955) +* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722, + #17197, #18338, #18842, #18899) ++ [vrv] Add support for authentication (#14307) +* [videomore:season] Fix extraction +* [videomore] Improve extraction (#18908) ++ [tnaflix] Pass Referer in metadata request (#18925) +* [radiocanada] Relax DRM check (#18608, #18609) +* [vimeo] Fix video password verification for videos protected by + Referer HTTP header ++ [hketv] Add support for hkedcity.net (#18696) ++ [streamango] Add support for fruithosts.net 
(#18710) ++ [instagram] Add support for tags (#18757) ++ [odnoklassniki] Detect paid videos (#18876) +* [ted] Correct acodec for HTTP formats (#18923) +* [cartoonnetwork] Fix extraction (#15664, #17224) +* [vimeo] Fix extraction for password protected player URLs (#18889) + + +version 2019.01.17 + +Extractors +* [youtube] Extend JS player signature function name regular expressions + (#18890, #18891, #18893) + + +version 2019.01.16 + +Core ++ [test/helper] Add support for maxcount and count collection len checkers +* [downloader/hls] Fix uplynk ad skipping (#18824) +* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813) + +Extractors +* [youtube] Skip unsupported adaptive stream type (#18804) ++ [youtube] Extract DASH formats from player response (#18804) +* [funimation] Fix extraction (#14089) +* [skylinewebcams] Fix extraction (#18853) ++ [curiositystream] Add support for non app URLs ++ [bitchute] Check formats (#18833) +* [wistia] Extend URL regular expression (#18823) ++ [playplustv] Add support for playplus.com (#18789) + + +version 2019.01.10 + +Core +* [extractor/common] Use episode name as title in _json_ld ++ [extractor/common] Add support for movies in _json_ld +* [postprocessor/ffmpeg] Embed subtitles with non-standard language codes + (#18765) ++ [utils] Add language codes replaced in 1989 revision of ISO 639 + to ISO639Utils (#18765) + +Extractors +* [youtube] Extract live HLS URL from player response (#18799) ++ [outsidetv] Add support for outsidetv.com (#18774) +* [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs ++ [fox] Add support for National Geographic (#17985, #15333, #14698) ++ [playplustv] Add support for playplus.tv (#18789) +* [globo] Set GLBID cookie manually (#17346) ++ [gaia] Add support for gaia.com (#14605) +* [youporn] Fix title and description extraction (#18748) ++ [hungama] Add support for hungama.com (#17402, #18771) +* [dtube] Fix extraction (#18741) +* [tvnow] Fix and rework extractors and prepare 
for a switch to the new API + (#17245, #18499) +* [carambatv:page] Fix extraction (#18739) + + +version 2019.01.02 + +Extractors +* [discovery] Use geo verification headers (#17838) ++ [packtpub] Add support for subscription.packtpub.com (#18718) +* [yourporn] Fix extraction (#18583) ++ [acast:channel] Add support for play.acast.com (#18587) ++ [extractors] Add missing age limits (#18621) ++ [rmcdecouverte] Add support for live stream +* [rmcdecouverte] Bypass geo restriction +* [rmcdecouverte] Update URL regular expression (#18595, #18697) +* [manyvids] Fix extraction (#18604, #18614) +* [bitchute] Fix extraction (#18567) + + +version 2018.12.31 + +Extractors ++ [bbc] Add support for another embed pattern (#18643) ++ [npo:live] Add support for npostart.nl (#18644) +* [beeg] Fix extraction (#18610, #18626) +* [youtube] Unescape HTML for series (#18641) ++ [youtube] Extract more format metadata +* [youtube] Detect DRM protected videos (#1774) +* [youtube] Relax HTML5 player regular expressions (#18465, #18466) +* [youtube] Extend HTML5 player regular expression (#17516) ++ [liveleak] Add support for another embed type and restore original + format extraction ++ [crackle] Extract ISM and HTTP formats ++ [twitter] Pass Referer with card request (#18579) +* [mediasite] Extend URL regular expression (#18558) ++ [lecturio] Add support for lecturio.de (#18562) ++ [discovery] Add support for Scripps Networks watch domains (#17947) + + +version 2018.12.17 + +Extractors +* [ard:beta] Improve geo restricted videos extraction +* [ard:beta] Fix subtitles extraction +* [ard:beta] Improve extraction robustness +* [ard:beta] Relax URL regular expression (#18441) +* [acast] Add support for embed.acast.com and play.acast.com (#18483) +* [iprima] Relax URL regular expression (#18515, #18540) +* [vrv] Fix initial state extraction (#18553) +* [youtube] Fix mark watched (#18546) ++ [safari] Add support for learning.oreilly.com (#18510) +* [youtube] Fix multifeed extraction (#18531) +* 
[lecturio] Improve subtitles extraction (#18488) +* [uol] Fix format URL extraction (#18480) ++ [ard:mediathek] Add support for classic.ardmediathek.de (#18473) + + +version 2018.12.09 + +Core +* [YoutubeDL] Keep session cookies in cookie file between runs +* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) + +Extractors ++ [teachable] Add support for teachable platform sites (#5451, #18150, #18272) ++ [aenetworks] Add support for historyvault.com (#18460) +* [imgur] Improve gallery and album detection and extraction (#9133, #16577, + #17223, #18404) +* [iprima] Relax URL regular expression (#18453) +* [hotstar] Fix video data extraction (#18386) +* [ard:mediathek] Fix title and description extraction (#18349, #18371) +* [xvideos] Switch to HTTPS (#18422, #18427) ++ [lecturio] Add support for lecturio.com (#18405) ++ [nrktv:series] Add support for extra materials +* [nrktv:season,series] Fix extraction (#17159, #17258) +* [nrktv] Relax URL regular expression (#18304, #18387) +* [yourporn] Fix extraction (#18424, #18425) +* [tbs] Fix info extraction (#18403) ++ [gamespot] Add support for review URLs + + +version 2018.12.03 + +Core +* [utils] Fix random_birthday to generate existing dates only (#18284) + +Extractors ++ [tiktok] Add support for tiktok.com (#18108, #18135) +* [pornhub] Use actual URL host for requests (#18359) +* [lynda] Fix authentication (#18158, #18217) +* [gfycat] Update API endpoint (#18333, #18343) ++ [hotstar] Add support for alternative app state layout (#18320) +* [azmedien] Fix extraction (#18334, #18336) ++ [vimeo] Add support for VHX (Vimeo OTT) (#14835) +* [joj] Fix extraction (#18280, #18281) ++ [wistia] Add support for fast.wistia.com (#18287) + + +version 2018.11.23 + +Core ++ [setup.py] Add more relevant classifiers + +Extractors +* [mixcloud] Fallback to hardcoded decryption key (#18016) +* [nbc:news] Fix article extraction (#16194) +* [foxsports] Fix extraction (#17543) +* [loc] Relax regular expression and 
improve formats extraction ++ [ciscolive] Add support for ciscolive.cisco.com (#17984) +* [nzz] Relax kaltura regex (#18228) +* [sixplay] Fix formats extraction +* [bitchute] Improve title extraction +* [kaltura] Limit requested MediaEntry fields ++ [americastestkitchen] Add support for zype embeds (#18225) ++ [pornhub] Add pornhub.net alias +* [nova:embed] Fix extraction (#18222) + + +version 2018.11.18 + +Extractors ++ [wwe] Extract subtitles ++ [wwe] Add support for playlists (#14781) ++ [wwe] Add support for wwe.com (#14781, #17450) +* [vk] Detect geo restriction (#17767) +* [openload] Use original host during extraction (#18211) +* [atvat] Fix extraction (#18041) ++ [rte] Add support for new API endpoint (#18206) +* [tnaflixnetwork:embed] Fix extraction (#18205) +* [picarto] Use API and add token support (#16518) ++ [zype] Add support for player.zype.com (#18143) +* [vivo] Fix extraction (#18139) +* [ruutu] Update API endpoint (#18138) + + +version 2018.11.07 + +Extractors ++ [youtube] Add another JS signature function name regex (#18091, #18093, + #18094) +* [facebook] Fix tahoe request (#17171) +* [cliphunter] Fix extraction (#18083) ++ [youtube:playlist] Add support for invidio.us (#18077) +* [zattoo] Arrange API hosts for derived extractors (#18035) ++ [youtube] Add fallback metadata extraction from videoDetails (#18052) + + +version 2018.11.03 + +Core +* [extractor/common] Ensure response handle is not prematurely closed before + it can be read if it matches expected_status (#17195, #17846, #17447) + +Extractors +* [laola1tv:embed] Set correct stream access URL scheme (#16341) ++ [ehftv] Add support for ehftv.com (#15408) +* [azmedien] Adopt to major site redesign (#17745, #17746) ++ [twitcasting] Add support for twitcasting.tv (#17981) +* [orf:tvthek] Fix extraction (#17737, #17956, #18024) ++ [openload] Add support for oload.fun (#18045) +* [njpwworld] Fix authentication (#17427) ++ [linkedin:learning] Add support for linkedin.com/learning (#13545) +* 
[theplatform] Improve error detection (#13222) +* [cnbc] Simplify extraction (#14280, #17110) ++ [cnbc] Add support for new URL schema (#14193) +* [aparat] Improve extraction and extract more metadata (#17445, #18008) +* [aparat] Fix extraction + + +version 2018.10.29 + +Core ++ [extractor/common] Add validation for JSON-LD URLs + +Extractors ++ [sportbox] Add support for matchtv.ru +* [sportbox] Fix extraction (#17978) +* [screencast] Fix extraction (#14590, #14617, #17990) ++ [openload] Add support for oload.icu ++ [ivi] Add support for ivi.tv +* [crunchyroll] Improve extraction failsafeness (#17991) +* [dailymail] Fix formats extraction (#17976) +* [viewster] Reduce format requests +* [cwtv] Handle API errors (#17905) ++ [rutube] Use geo verification headers (#17897) ++ [brightcove:legacy] Add fallbacks to brightcove:new (#13912) +- [tv3] Remove extractor (#10461, #15339) +* [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894) ++ [openload] Add support for oload.cc (#17823) ++ [patreon] Extract post_file URL (#17792) +* [patreon] Fix extraction (#14502, #10471) + + +version 2018.10.05 + +Extractors +* [pluralsight] Improve authentication (#17762) +* [dailymotion] Fix extraction (#17699) +* [crunchyroll] Switch to HTTPS for RpcApi (#17749) ++ [philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705) +* [philharmoniedeparis] Fix extraction (#17705) ++ [jamendo] Add support for licensing.jamendo.com (#17724) ++ [openload] Add support for oload.cloud (#17710) +* [pluralsight] Fix subtitles extraction (#17726, #17728) ++ [vimeo] Add another config regular expression (#17690) +* [spike] Fix Paramount Network extraction (#17677) +* [hotstar] Fix extraction (#14694, #14931, #17637) + + +version 2018.09.26 + +Extractors +* [pluralsight] Fix subtitles extraction (#17671) +* [mediaset] Improve embed support (#17668) ++ [youtube] Add support for invidio.us (#17613) ++ [zattoo] Add support for more zattoo platform sites +* [zattoo] Fix 
extraction (#17175, #17542) + + +version 2018.09.18 + +Core ++ [extractor/common] Introduce channel meta fields + +Extractors +* [adobepass] Don't pollute default headers dict +* [udemy] Don't pollute default headers dict +* [twitch] Don't pollute default headers dict +* [youtube] Don't pollute default query dict (#17593) +* [crunchyroll] Prefer hardsubless formats and formats in locale language +* [vrv] Make format ids deterministic +* [vimeo] Fix ondemand playlist extraction (#14591) ++ [pornhub] Extract upload date (#17574) ++ [porntube] Extract channel meta fields ++ [vimeo] Extract channel meta fields ++ [youtube] Extract channel meta fields (#9676, #12939) +* [porntube] Fix extraction (#17541) +* [asiancrush] Fix extraction (#15630) ++ [twitch:clips] Extend URL regular expression (#17559) ++ [vzaar] Add support for HLS +* [tube8] Fix metadata extraction (#17520) +* [eporner] Extract JSON-LD (#17519) + + +version 2018.09.10 + +Core ++ [utils] Properly recognize AV1 codec (#17506) + +Extractors ++ [iprima] Add support for prima.iprima.cz (#17514) ++ [tele5] Add support for tele5.de (#7805, #7922, #17331, #17414) +* [nbc] Fix extraction of percent encoded URLs (#17374) + + +version 2018.09.08 + +Extractors +* [youtube] Fix extraction (#17457, #17464) ++ [pornhub:uservideos] Add support for new URLs (#17388) +* [iprima] Confirm adult check (#17437) +* [slideslive] Make check for video service name case-insensitive (#17429) +* [radiojavan] Fix extraction (#17151) +* [generic] Skip unsuccessful jwplayer extraction (#16735) + + +version 2018.09.01 + +Core +* [utils] Skip remote IP addresses non matching to source address' IP version + when creating a connection (#13422, #17362) + +Extractors ++ [ard] Add support for one.ard.de (#17397) +* [niconico] Fix extraction on python3 (#17393, #17407) +* [ard] Extract f4m formats +* [crunchyroll] Parse vilos media data (#17343) ++ [ard] Add support for Beta ARD Mediathek ++ [bandcamp] Extract more metadata (#13197) +* 
[internazionale] Fix extraction of non-available-abroad videos (#17386) + + +version 2018.08.28 + +Extractors ++ [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix) + (#17361) +* [bitchute] Fix extraction by passing a custom User-Agent (#17360) +* [webofstories:playlist] Fix extraction (#16914) ++ [tvplayhome] Add support for new tvplay URLs (#17344) ++ [generic] Allow relative src for videojs embeds (#17324) ++ [xfileshare] Add support for vidto.se (#17317) ++ [vidzi] Add support for vidzi.nu (#17316) ++ [nova:embed] Add support for media.cms.nova.cz (#17282) + + +version 2018.08.22 + +Core +* [utils] Use pure browser header for User-Agent (#17236) + +Extractors ++ [kinopoisk] Add support for kinopoisk.ru (#17283) ++ [yourporn] Add support for yourporn.sexy (#17298) ++ [go] Add support for disneynow.go.com (#16299, #17264) ++ [6play] Add support for play.rtl.hr (#17249) +* [anvato] Fallback to generic API key for access-key-to-API-key lookup + (#16788, #17254) +* [lci] Fix extraction (#17274) +* [bbccouk] Extend id URL regular expression (#17270) +* [cwtv] Fix extraction (#17256) +* [nova] Fix extraction (#17241) ++ [generic] Add support for expressen embeds +* [raywenderlich] Adapt to site redesign (#17225) ++ [redbulltv] Add support for redbull.com tv URLs (#17218) ++ [bitchute] Add support for bitchute.com (#14052) ++ [clyp] Add support for token protected media (#17184) +* [imdb] Fix extension extraction (#17167) + + +version 2018.08.04 + +Extractors +* [funk:channel] Improve byChannelAlias extraction (#17142) +* [twitch] Fix authentication (#17024, #17126) +* [twitch:vod] Improve URL regular expression (#17135) +* [watchbox] Fix extraction (#17107) +* [pbs] Fix extraction (#17109) +* [theplatform] Relax URL regular expression (#16181, #17097) ++ [viqeo] Add support for viqeo.tv (#17066) + + +version 2018.07.29 + +Extractors +* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) ++ [pornhub] Add support for subtitles (#16924, 
#17088) +* [ceskatelevize] Use https for API call (#16997, #16999) +* [dailymotion:playlist] Fix extraction (#16894) +* [ted] Improve extraction +* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085) +* [telecinco] Fix extraction (#17080) +* [mitele] Reduce number of requests +* [rai] Return non HTTP relinker URL intact (#17055) +* [vk] Fix extraction for inline only videos (#16923) +* [streamcloud] Fix extraction (#17054) +* [facebook] Fix tahoe player extraction with authentication (#16655) ++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269) + + +version 2018.07.21 + +Core ++ [utils] Introduce url_or_none +* [utils] Allow JSONP without function name (#17028) ++ [extractor/common] Extract DASH and MSS formats from SMIL manifests + +Extractors ++ [bbc] Add support for BBC Radio Play pages (#17022) +* [iwara] Fix download URLs (#17026) +* [vrtnu] Relax title extraction and extract JSON-LD (#17018) ++ [viu] Pass Referer and Origin headers and area id (#16992) ++ [vimeo] Add another config regular expression (#17013) ++ [facebook] Extract view count (#16942) +* [dailymotion] Improve description extraction (#16984) +* [slutload] Fix and improve extraction (#17001) +* [mediaset] Fix extraction (#16977) ++ [theplatform] Add support for theplatform TLD customization (#16977) +* [imgur] Relax URL regular expression (#16987) +* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262, + #16959) + + +version 2018.07.10 + +Core +* [utils] Share JSON-LD regular expression +* [downloader/dash] Improve error handling (#16927) + +Extractors ++ [nrktv] Add support for new season and serie URL schema ++ [nrktv] Add support for new episode URL schema (#16909) ++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328) +* [funk] Fix extraction (#16918) +* [watchbox] Fix extraction (#16904) +* [dplayit] Sort formats +* [dplayit] Fix extraction (#16901) +* [youtube] Improve login error handling (#13822) + + +version 
2018.07.04 + +Core +* [extractor/common] Properly escape % in MPD templates (#16867) +* [extractor/common] Use source URL as Referer for HTML5 entries (#16849) +* Prefer ffmpeg over avconv by default (#8622) + +Extractors +* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899) +* [lynda] Simplify login and improve error capturing (#16891) ++ [go90] Add support for embed URLs (#16873) +* [go90] Detect geo restriction error and pass geo verification headers + (#16874) +* [vlive] Fix live streams extraction (#16871) +* [npo] Fix typo (#16872) ++ [mediaset] Add support for new videos and extract all formats (#16568) +* [dctptv] Restore extraction based on REST API (#16850) +* [svt] Improve extraction and add support for pages (#16802) +* [porncom] Fix extraction (#16808) + + +version 2018.06.25 + +Extractors +* [joj] Relax URL regular expression (#16771) +* [brightcove] Workaround sonyliv DRM protected videos (#16807) +* [motherless] Fix extraction (#16786) +* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) +- [foxnews:insider] Remove extractor (#15810) ++ [foxnews] Add support for iframe embeds (#15810, #16711) + + +version 2018.06.19 + +Core ++ [extractor/common] Introduce expected_status in _download_* methods + for convenient accept of HTTP requests failed with non 2xx status codes ++ [compat] Introduce compat_integer_types + +Extractors +* [peertube] Improve generic support (#16733) ++ [6play] Use geo verification headers +* [rtbf] Fix extraction for python 3.2 +* [vgtv] Improve HLS formats extraction ++ [vgtv] Add support for www.aftonbladet.se/tv URLs +* [bbccouk] Use expected_status +* [markiza] Expect 500 HTTP status code +* [tvnow] Try all clear manifest URLs (#15361) + + +version 2018.06.18 + +Core +* [downloader/rtmp] Fix downloading in verbose mode (#16736) + +Extractors ++ [markiza] Add support for markiza.sk (#16750) +* [wat] Try all supported adaptive URLs ++ [6play] Add support for rtlplay.be and extract hd usp 
formats ++ [rtbf] Add support for audio and live streams (#9638, #11923) ++ [rtbf] Extract HLS, DASH and all HTTP formats ++ [rtbf] Extract subtitles ++ [rtbf] Fixup specific HTTP URLs (#16101) ++ [expressen] Add support for expressen.se +* [vidzi] Fix extraction (#16678) +* [pbs] Improve extraction (#16623, #16684) +* [bilibili] Restrict cid regular expression (#16638, #16734) + + +version 2018.06.14 + +Core +* [downloader/http] Fix retry on error when streaming to stdout (#16699) + +Extractors ++ [discoverynetworks] Add support for disco-api videos (#16724) ++ [dailymotion] Add support for password protected videos (#9789) ++ [abc:iview] Add support for livestreams (#12354) +* [abc:iview] Fix extraction (#16704) ++ [crackle] Add support for sonycrackle.com (#16698) ++ [tvnet] Add support for tvnet.gov.vn (#15462) +* [nrk] Update API hosts and try all previously known ones (#16690) +* [wimp] Fix Youtube embeds extraction + + +version 2018.06.11 + +Extractors +* [npo] Extend URL regular expression and add support for npostart.nl (#16682) ++ [inc] Add support for another embed schema (#16666) +* [tv4] Fix format extraction (#16650) ++ [nexx] Add support for free cdn (#16538) ++ [pbs] Add another cove id pattern (#15373) ++ [rbmaradio] Add support for 192k format (#16631) + + +version 2018.06.04 + +Extractors ++ [camtube] Add support for camtube.co ++ [twitter:card] Extract guest token (#16609) ++ [chaturbate] Use geo verification headers ++ [bbc] Add support for bbcthree (#16612) +* [youtube] Move metadata extraction after video availability check ++ [youtube] Extract track and artist ++ [safari] Add support for new URL schema (#16614) +* [adn] Fix extraction + + +version 2018.06.02 + +Core +* [utils] Improve determine_ext + +Extractors ++ [facebook] Add support for tahoe player videos (#15441, #16554) +* [cbc] Improve extraction (#16583, #16593) +* [openload] Improve ext extraction (#16595) ++ [twitter:card] Add support for another endpoint (#16586) ++ [openload] 
Add support for oload.win and oload.download (#16592) +* [audimedia] Fix extraction (#15309) ++ [francetv] Add support for sport.francetvinfo.fr (#15645) +* [mlb] Improve extraction (#16587) +- [nhl] Remove old extractors +* [rbmaradio] Check formats availability (#16585) + + +version 2018.05.30 + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + +version 2018.05.26 + +Core +* [utils] Improve parse_age_limit + +Extractors +* [audiomack] Stringify video id (#15310) +* [izlesene] Fix extraction (#16233, #16271, #16407) ++ [indavideo] Add support for generic embeds (#11989) +* [indavideo] Fix extraction (#11221) +* [indavideo] Sign download URLs (#16174) ++ [peertube] Add support for PeerTube based sites (#16301, #16329) +* [imgur] Fix extraction (#16537) ++ [hidive] Add support for authentication (#16534) ++ [nbc] Add support for stream.nbcsports.com (#13911) ++ [viewlift] Add support for hoichoi.tv (#16536) +* [go90] Extract age limit and detect DRM protection (#10127) +* [viewlift] Fix extraction for snagfilms.com (#15766) +* [globo] Improve extraction (#4189) + * Add support for authentication + * Simplify URL signing + * Extract DASH and MSS formats +* 
[leeco] Fix extraction (#16464) +* [teamcoco] Add fallback for format extraction (#16484) +* [teamcoco] Improve URL regular expression (#16484) +* [imdb] Improve extraction (#4085, #14557) + + +version 2018.05.18 + +Extractors +* [vimeo:likes] Relax URL regular expression and fix single page likes + extraction (#16475) +* [pluralsight] Fix clip id extraction (#16460) ++ [mychannels] Add support for mychannels.com (#15334) +- [moniker] Remove extractor (#15336) +* [pbs] Fix embed data extraction (#16474) ++ [mtv] Add support for paramountnetwork.com and bellator.com (#15418) +* [youtube] Fix hd720 format position +* [dailymotion] Remove fragment part from m3u8 URLs (#8915) +* [3sat] Improve extraction (#15350) + * Extract all formats + * Extract more format metadata + * Improve format sorting + * Use hls native downloader + * Detect and bypass geo-restriction ++ [dtube] Add support for d.tube (#15201) +* [options] Fix typo (#16450) +* [youtube] Improve format filesize extraction (#16453) +* [youtube] Make uploader extraction non fatal (#16444) +* [youtube] Fix extraction for embed restricted live streams (#16433) +* [nbc] Improve info extraction (#16440) +* [twitch:clips] Fix extraction (#16429) +* [redditr] Relax URL regular expression (#16426, #16427) +* [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) ++ [nick] Add support for nickjr.de (#13230) +* [teamcoco] Fix extraction (#16374) + + +version 2018.05.09 + +Core +* [YoutubeDL] Ensure ext exists for automatic captions +* Introduce --geo-bypass-ip-block + +Extractors ++ [udemy] Extract asset captions ++ [udemy] Extract stream URLs (#16372) ++ [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389) ++ [cloudflarestream] Add support for cloudflarestream.com (#16375) +* [watchbox] Fix extraction (#16356) +* [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954) ++ [itv:btcc] Add support for itv.com/btcc (#16139) +* [tunein] Use live title for live streams 
(#16347) +* [itv] Improve extraction (#16253) + + +version 2018.05.01 + +Core +* [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) ++ [extractor/common] Extract interaction statistic ++ [utils] Add merge_dicts ++ [extractor/common] Add _download_json_handle + +Extractors +* [kaltura] Improve iframe embeds detection (#16337) ++ [udemy] Extract outputs renditions (#16289, #16291, #16320, #16321, #16334, + #16335) ++ [zattoo] Add support for zattoo.com and mobiltv.quickline.com (#14668, #14676) +* [yandexmusic] Convert release_year to int +* [udemy] Override _download_webpage_handle instead of _download_webpage +* [xiami] Override _download_webpage_handle instead of _download_webpage +* [yandexmusic] Override _download_webpage_handle instead of _download_webpage +* [youtube] Correctly disable polymer on all requests (#16323, #16326) +* [generic] Prefer enclosures over links in RSS feeds (#16189) ++ [redditr] Add support for old.reddit.com URLs (#16274) +* [nrktv] Update API host (#16324) ++ [imdb] Extract all formats (#16249) ++ [vimeo] Extract JSON-LD (#16295) +* [funk:channel] Improve extraction (#16285) + + +version 2018.04.25 + +Core +* [utils] Fix match_str for boolean meta fields ++ [Makefile] Add support for pandoc 2 and disable smart extension (#16251) +* [YoutubeDL] Fix typo in media extension compatibility checker (#16215) + +Extractors ++ [openload] Recognize IPv6 stream URLs (#16136, #16137, #16205, #16246, + #16250) ++ [twitch] Extract is_live according to status (#16259) +* [pornflip] Relax URL regular expression (#16258) +- [etonline] Remove extractor (#16256) +* [breakcom] Fix extraction (#16254) ++ [youtube] Add ability to authenticate with cookies +* [youtube:feed] Implement lazy playlist extraction (#10184) ++ [svt] Add support for TV channel live streams (#15279, #15809) +* [ccma] Fix video extraction (#15931) +* [rentv] Fix extraction (#15227) ++ [nick] Add support for nickjr.nl (#16230) +* [extremetube] Fix metadata 
extraction ++ [keezmovies] Add support for generic embeds (#16134, #16154) +* [nexx] Extract new azure URLs (#16223) +* [cbssports] Fix extraction (#16217) +* [kaltura] Improve embeds detection (#16201) +* [instagram:user] Fix extraction (#16119) +* [cbs] Skip DRM asset types (#16104) + + +version 2018.04.16 + +Extractors +* [smotri:broadcast] Fix extraction (#16180) ++ [picarto] Add support for picarto.tv (#6205, #12514, #15276, #15551) +* [vine:user] Fix extraction (#15514, #16190) +* [pornhub] Relax URL regular expression (#16165) +* [cbc:watch] Re-acquire device token when expired (#16160) ++ [fxnetworks] Add support for https theplatform URLs (#16125, #16157) ++ [instagram:user] Add request signing (#16119) ++ [twitch] Add support for mobile URLs (#16146) + + +version 2018.04.09 + +Core +* [YoutubeDL] Do not save/restore console title while simulate (#16103) +* [extractor/common] Relax JSON-LD context check (#16006) + +Extractors ++ [generic] Add support for tube8 embeds ++ [generic] Add support for share-videos.se embeds (#16089, #16115) +* [odnoklassniki] Extend URL regular expression (#16081) +* [steam] Bypass mature content check (#16113) ++ [acast] Extract more metadata +* [acast] Fix extraction (#16118) +* [instagram:user] Fix extraction (#16119) +* [drtuber] Fix title extraction (#16107, #16108) +* [liveleak] Extend URL regular expression (#16117) ++ [openload] Add support for oload.xyz +* [openload] Relax stream URL regular expression +* [openload] Fix extraction (#16099) ++ [svtplay:series] Add support for season URLs ++ [svtplay:series] Add support for series (#11130, #16059) + + +version 2018.04.03 + +Extractors ++ [tvnow] Add support for shows (#15837) +* [dramafever] Fix authentication (#16067) +* [afreecatv] Use partial view only when necessary (#14450) ++ [afreecatv] Add support for authentication (#14450) ++ [nationalgeographic] Add support for new URL schema (#16001, #16054) +* [xvideos] Fix thumbnail extraction (#15978, #15979) +* [medialaan] 
Fix vod id (#16038) ++ [openload] Add support for oload.site (#16039) +* [naver] Fix extraction (#16029) +* [dramafever] Partially switch to API v5 (#16026) +* [abc:iview] Unescape title and series meta fields (#15994) +* [videa] Extend URL regular expression (#16003) + + +version 2018.03.26.1 + +Core ++ [downloader/external] Add elapsed time to progress hook (#10876) +* [downloader/external,fragment] Fix download finalization when writing file + to stdout (#10809, #10876, #15799) + +Extractors +* [vrv] Fix extraction on python2 (#15928) +* [afreecatv] Update referrer (#15947) ++ [24video] Add support for 24video.sexy (#15973) +* [crackle] Bypass geo restriction +* [crackle] Fix extraction (#15969) ++ [lenta] Add support for lenta.ru (#15953) ++ [instagram:user] Add pagination (#15934) +* [youku] Update ccode (#15939) +* [libsyn] Adapt to new page structure + + +version 2018.03.20 + +Core +* [extractor/common] Improve thumbnail extraction for HTML5 entries +* Generalize XML manifest processing code and improve XSPF parsing ++ [extractor/common] Add _download_xml_handle ++ [extractor/common] Add support for relative URIs in _parse_xspf (#15794) + +Extractors ++ [7plus] Extract series metadata (#15862, #15906) +* [9now] Bypass geo restriction (#15920) +* [cbs] Skip unavailable assets (#13490, #13506, #15776) ++ [canalc2] Add support for HTML5 videos (#15916, #15919) ++ [ceskatelevize] Add support for iframe embeds (#15918) ++ [prosiebensat1] Add support for galileo.tv (#15894) ++ [generic] Add support for xfileshare embeds (#15879) +* [bilibili] Switch to v2 playurl API +* [bilibili] Fix and improve extraction (#15048, #15430, #15622, #15863) +* [heise] Improve extraction (#15496, #15784, #15026) +* [instagram] Fix user videos extraction (#15858) + + +version 2018.03.14 + +Extractors +* [soundcloud] Update client id (#15866) ++ [tennistv] Add support for tennistv.com ++ [line] Add support for tv.line.me (#9427) +* [xnxx] Fix extraction (#15817) +* [njpwworld] Fix 
authentication (#15815) + + +version 2018.03.10 + +Core +* [downloader/hls] Skip uplynk ad fragments (#15748) + +Extractors +* [pornhub] Don't override session cookies (#15697) ++ [raywenderlich] Add support for videos.raywenderlich.com (#15251) +* [funk] Fix extraction and rework extractors (#15792) +* [nexx] Restore reverse engineered approach ++ [heise] Add support for kaltura embeds (#14961, #15728) ++ [tvnow] Extract series metadata (#15774) +* [ruutu] Continue formats extraction on NOT-USED URLs (#15775) +* [vrtnu] Use redirect URL for building video JSON URL (#15767, #15769) +* [vimeo] Modernize login code and improve error messaging +* [archiveorg] Fix extraction (#15770, #15772) ++ [hidive] Add support for hidive.com (#15494) +* [afreecatv] Detect deleted videos +* [afreecatv] Fix extraction (#15755) +* [vice] Fix extraction and rework extractors (#11101, #13019, #13622, #13778) ++ [vidzi] Add support for vidzi.si (#15751) +* [npo] Fix typo + + +version 2018.03.03 + +Core ++ [utils] Add parse_resolution +Revert respect --prefer-insecure while updating + +Extractors ++ [yapfiles] Add support for yapfiles.ru (#15726, #11085) +* [spankbang] Fix formats extraction (#15727) +* [adn] Fix extraction (#15716) ++ [toggle] Extract DASH and ISM formats (#15721) ++ [nickelodeon] Add support for nickelodeon.com.tr (#15706) +* [npo] Validate and filter format URLs (#15709) + + +version 2018.02.26 + +Extractors +* [udemy] Use custom User-Agent (#15571) + + +version 2018.02.25 + +Core +* [postprocessor/embedthumbnail] Skip embedding when there aren't any + thumbnails (#12573) +* [extractor/common] Improve jwplayer subtitles extraction (#15695) + +Extractors ++ [vidlii] Add support for vidlii.com (#14472, #14512, #14779) ++ [streamango] Capture and output error messages +* [streamango] Fix extraction (#14160, #14256) ++ [telequebec] Add support for emissions (#14649, #14655) ++ [telequebec:live] Add support for live streams (#15688) ++ [mailru:music] Add support for 
mail.ru/music (#15618) +* [aenetworks] Switch to akamai HLS formats (#15612) +* [ytsearch] Fix flat title extraction (#11260, #15681) + + +version 2018.02.22 + +Core ++ [utils] Fixup some common URL typos in sanitize_url (#15649) +* Respect --prefer-insecure while updating (#15497) + +Extractors +* [vidio] Fix HLS URL extraction (#15675) ++ [nexx] Add support for arc.nexx.cloud URLs +* [nexx] Switch to arc API (#15652) +* [redtube] Fix duration extraction (#15659) ++ [sonyliv] Respect referrer (#15648) ++ [brightcove:new] Use referrer for formats' HTTP headers ++ [cbc] Add support for olympics.cbc.ca (#15535) ++ [fusion] Add support for fusion.tv (#15628) +* [npo] Improve quality metadata extraction +* [npo] Relax URL regular expression (#14987, #14994) ++ [npo] Capture and output error message ++ [pornhub] Add support for channels (#15613) +* [youtube] Handle shared URLs with generic extractor (#14303) + + +version 2018.02.11 + +Core ++ [YoutubeDL] Add support for filesize_approx in format selector (#15550) + +Extractors ++ [francetv] Add support for live streams (#13689) ++ [francetv] Add support for zouzous.fr and ludo.fr (#10454, #13087, #13103, + #15012) +* [francetv] Separate main extractor and rework others to delegate to it +* [francetv] Improve manifest URL signing (#15536) ++ [francetv] Sign m3u8 manifest URLs (#15565) ++ [veoh] Add support for embed URLs (#15561) +* [afreecatv] Fix extraction (#15556) +* [periscope] Use accessVideoPublic endpoint (#15554) +* [discovery] Fix auth request (#15542) ++ [6play] Extract subtitles (#15541) +* [newgrounds] Fix metadata extraction (#15531) ++ [nbc] Add support for stream.nbcolympics.com (#10295) +* [dvtv] Fix live streams extraction (#15442) + + +version 2018.02.08 + +Extractors ++ [myvi] Extend URL regular expression ++ [myvi:embed] Add support for myvi.tv embeds (#15521) ++ [prosiebensat1] Extend URL regular expression (#15520) +* [pokemon] Relax URL regular expression and extend title extraction (#15518) ++ 
[gameinformer] Use geo verification headers +* [la7] Fix extraction (#15501, #15502) +* [gameinformer] Fix brightcove id extraction (#15416) ++ [afreecatv] Pass referrer to video info request (#15507) ++ [telebruxelles] Add support for live streams +* [telebruxelles] Relax URL regular expression +* [telebruxelles] Fix extraction (#15504) +* [extractor/common] Respect secure schemes in _extract_wowza_formats + + +version 2018.02.04 + +Core +* [downloader/http] Randomize HTTP chunk size ++ [downloader/http] Add ability to pass downloader options via info dict +* [downloader/http] Fix 302 infinite loops by not reusing requests ++ Document http_chunk_size + +Extractors ++ [brightcove] Pass embed page URL as referrer (#15486) ++ [youtube] Enforce using chunked HTTP downloading for DASH formats + + +version 2018.02.03 + +Core ++ Introduce --http-chunk-size for chunk-based HTTP downloading ++ Add support for IronPython +* [downloader/ism] Fix Python 3.2 support + +Extractors +* [redbulltv] Fix extraction (#15481) +* [redtube] Fix metadata extraction (#15472) +* [pladform] Respect platform id and extract HLS formats (#15468) +- [rtlnl] Remove progressive formats (#15459) +* [6play] Do not modify asset URLs with a token (#15248) +* [nationalgeographic] Relax URL regular expression +* [dplay] Relax URL regular expression (#15458) +* [cbsinteractive] Fix data extraction (#15451) ++ [amcnetworks] Add support for sundancetv.com (#9260) + + +version 2018.01.27 + +Core +* [extractor/common] Improve _json_ld for articles +* Switch codebase to use compat_b64decode ++ [compat] Add compat_b64decode + +Extractors ++ [seznamzpravy] Add support for seznam.cz and seznamzpravy.cz (#14102, #14616) +* [dplay] Bypass geo restriction ++ [dplay] Add support for disco-api videos (#15396) +* [youtube] Extract precise error messages (#15284) +* [teachertube] Capture and output error message +* [teachertube] Fix and relax thumbnail extraction (#15403) ++ [prosiebensat1] Add another clip id regular 
expression (#15378) +* [tbs] Update tokenizer url (#15395) +* [mixcloud] Use compat_b64decode (#15394) +- [thesixtyone] Remove extractor (#15341) + + +version 2018.01.21 + +Core +* [extractor/common] Improve jwplayer DASH formats extraction (#9242, #15187) +* [utils] Improve scientific notation handling in js_to_json (#14789) + +Extractors ++ [southparkdk] Add support for southparkstudios.nu ++ [southpark] Add support for collections (#14803) +* [franceinter] Fix upload date extraction (#14996) ++ [rtvs] Add support for rtvs.sk (#9242, #15187) +* [restudy] Fix extraction and extend URL regular expression (#15347) +* [youtube:live] Improve live detection (#15365) ++ [springboardplatform] Add support for springboardplatform.com +* [prosiebensat1] Add another clip id regular expression (#15290) +- [ringtv] Remove extractor (#15345) + + +version 2018.01.18 + +Extractors +* [soundcloud] Update client id (#15306) +- [kamcord] Remove extractor (#15322) ++ [spiegel] Add support for nexx videos (#15285) +* [twitch] Fix authentication and error capture (#14090, #15264) +* [vk] Detect more errors due to copyright complaints (#15259) + + +version 2018.01.14 + +Extractors +* [youtube] Fix live streams extraction (#15202) +* [wdr] Bypass geo restriction +* [wdr] Rework extractors (#14598) ++ [wdr] Add support for wdrmaus.de/elefantenseite (#14598) ++ [gamestar] Add support for gamepro.de (#3384) +* [viafree] Skip rtmp formats (#15232) ++ [pandoratv] Add support for mobile URLs (#12441) ++ [pandoratv] Add support for new URL format (#15131) ++ [ximalaya] Add support for ximalaya.com (#14687) ++ [digg] Add support for digg.com (#15214) +* [limelight] Tolerate empty pc formats (#15150, #15151, #15207) +* [ndr:embed:base] Make separate formats extraction non fatal (#15203) ++ [weibo] Add extractor (#15079) ++ [ok] Add support for live streams +* [canalplus] Fix extraction (#15072) +* [bilibili] Fix extraction (#15188) + + +version 2018.01.07 + +Core +* [utils] Fix youtube-dl under 
PyPy3 on Windows +* [YoutubeDL] Output python implementation in debug header + +Extractors ++ [jwplatform] Add support for multiple embeds (#15192) +* [mitele] Fix extraction (#15186) ++ [motherless] Add support for groups (#15124) +* [lynda] Relax URL regular expression (#15185) +* [soundcloud] Fallback to avatar picture for thumbnail (#12878) +* [youku] Fix list extraction (#15135) +* [openload] Fix extraction (#15166) +* [lynda] Skip invalid subtitles (#15159) +* [twitch] Pass video id to url_result when extracting playlist (#15139) +* [rtve.es:alacarta] Fix extraction of some new URLs +* [acast] Fix extraction (#15147) + + +version 2017.12.31 + +Core ++ [extractor/common] Add container meta field for formats extracted + in _parse_mpd_formats (#13616) ++ [downloader/hls] Use HTTP headers for key request +* [common] Use AACL as the default fourcc when AudioTag is 255 +* [extractor/common] Fix extraction of DASH formats with the same + representation id (#15111) + +Extractors ++ [slutload] Add support for mobile URLs (#14806) +* [abc:iview] Bypass geo restriction +* [abc:iview] Fix extraction (#14711, #14782, #14838, #14917, #14963, #14985, + #15035, #15057, #15061, #15071, #15095, #15106) +* [openload] Fix extraction (#15118) +- [sandia] Remove extractor +- [collegerama] Remove extractor ++ [mediasite] Add support for sites based on Mediasite Video Platform (#5428, + #11185, #14343) ++ [ufctv] Add support for ufc.tv (#14520) +* [pluralsight] Fix missing first line of subtitles (#11118) +* [openload] Fallback on f-page extraction (#14665, #14879) +* [vimeo] Improve password protected videos extraction (#15114) +* [aws] Fix canonical/signed headers generation on python 2 (#15102) + + +version 2017.12.28 + +Extractors ++ [internazionale] Add support for internazionale.it (#14973) +* [playtvak] Relax video regular expression and make description optional + (#15037) ++ [filmweb] Add support for filmweb.no (#8773, #10368) ++ [23video] Add support for 23video.com ++ 
[espn] Add support for fivethirtyeight.com (#6864) ++ [umg:de] Add support for universal-music.de (#11582, #11584) ++ [espn] Add support for espnfc and extract more formats (#8053) +* [youku] Update ccode (#14880) ++ [openload] Add support for oload.stream (#15070) +* [youku] Fix list extraction (#15065) + + +version 2017.12.23 + +Core +* [extractor/common] Move X-Forwarded-For setup code into _request_webpage ++ [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in + output template (#11427, #15018) ++ [extractor/common] Introduce uploader, uploader_id and uploader_url + meta fields for playlists (#11427, #15018) +* [downloader/fragment] Encode filename of fragment being removed (#15020) ++ [utils] Add another date format pattern (#14999) + +Extractors ++ [kaltura] Add another embed pattern for entry_id ++ [7plus] Add support for 7plus.com.au (#15043) +* [animeondemand] Relax login error regular expression ++ [shahid] Add support for show pages (#7401) ++ [youtube] Extract uploader, uploader_id and uploader_url for playlists + (#11427, #15018) +* [afreecatv] Improve format extraction (#15019) ++ [cspan] Add support for audio only pages and catch page errors (#14995) ++ [mailru] Add support for embed URLs (#14904) +* [crunchyroll] Future-proof XML element checks (#15013) +* [cbslocal] Fix timestamp extraction (#14999, #15000) +* [discoverygo] Correct TTML subtitle extension +* [vk] Make view count optional (#14979) +* [disney] Skip Apple FairPlay formats (#14982) +* [voot] Fix format extraction (#14758) + + +version 2017.12.14 + +Core +* [postprocessor/xattr] Clarify NO_SPACE message (#14970) +* [downloader/http] Return actual download result from real_download (#14971) + +Extractors ++ [itv] Extract more subtitles and duration +* [itv] Improve extraction (#14944) ++ [byutv] Add support for geo restricted videos +* [byutv] Fix extraction (#14966, #14967) ++ [bbccouk] Fix extraction for 320k HLS streams ++ [toutv] Add support for special video 
URLs (#14179) +* [discovery] Fix free videos extraction (#14157, #14954) +* [tvnow] Fix extraction (#7831) ++ [nickelodeon:br] Add support for nickelodeon brazil websites (#14893) +* [nick] Improve extraction (#14876) +* [tbs] Fix extraction (#13658) + + +version 2017.12.10 + +Core ++ [utils] Add sami mimetype to mimetype2ext + +Extractors +* [culturebox] Improve video id extraction (#14947) +* [twitter] Improve extraction (#14197) ++ [udemy] Extract more HLS formats +* [udemy] Improve course id extraction (#14938) ++ [stretchinternet] Add support for portal.stretchinternet.com (#14576) +* [ellentube] Fix extraction (#14407, #14570) ++ [raiplay:playlist] Add support for playlists (#14563) +* [sonyliv] Bypass geo restriction +* [sonyliv] Extract higher quality formats (#14922) +* [fox] Extract subtitles ++ [fox] Add support for Adobe Pass authentication (#14205, #14489) +- [dailymotion:cloud] Remove extractor (#6794) +* [xhamster] Fix thumbnail extraction (#14780) ++ [xhamster] Add support for mobile URLs (#14780) +* [generic] Don't pass video id as mpd id while extracting DASH (#14902) +* [ard] Skip invalid stream URLs (#14906) +* [porncom] Fix metadata extraction (#14911) +* [pluralsight] Detect agreement request (#14913) +* [toutv] Fix login (#14614) + + +version 2017.12.02 + +Core ++ [downloader/fragment] Commit part file after each fragment ++ [extractor/common] Add durations for DASH fragments with bare SegmentURLs ++ [extractor/common] Add support for DASH manifests with SegmentLists with + bare SegmentURLs (#14844) ++ [utils] Add hvc1 codec code to parse_codecs + +Extractors +* [xhamster] Fix extraction (#14884) +* [youku] Update ccode (#14872) +* [mnet] Fix format extraction (#14883) ++ [xiami] Add Referer header to API request +* [mtv] Correct scc extension in extracted subtitles (#13730) +* [vvvvid] Fix extraction for kenc videos (#13406) ++ [br] Add support for BR Mediathek videos (#14560, #14788) ++ [daisuki] Add support for motto.daisuki.com (#14681) 
+* [odnoklassniki] Fix API metadata request (#14862) +* [itv] Fix HLS formats extraction ++ [pbs] Add another media id regular expression + + +version 2017.11.26 + +Core +* [extractor/common] Use final URL when dumping request (#14769) + +Extractors +* [fczenit] Fix extraction +- [firstpost] Remove extractor +* [freespeech] Fix extraction +* [nexx] Extract more formats ++ [openload] Add support for openload.link (#14763) +* [empflix] Relax URL regular expression +* [empflix] Fix extraction +* [tnaflix] Don't modify download URLs (#14811) +- [gamersyde] Remove extractor +* [francetv:generationwhat] Fix extraction ++ [massengeschmacktv] Add support for Massengeschmack TV +* [fox9] Fix extraction +* [faz] Fix extraction and add support for Perform Group embeds (#14714) ++ [performgroup] Add support for performgroup.com ++ [jwplatform] Add support for iframes (#14828) +* [culturebox] Fix extraction (#14827) +* [youku] Fix extraction; update ccode (#14815) +* [livestream] Make SMIL extraction non fatal (#14792) ++ [drtuber] Add support for mobile URLs (#14772) ++ [spankbang] Add support for mobile URLs (#14771) +* [instagram] Fix description, timestamp and counters extraction (#14755) + + +version 2017.11.15 + +Core +* [common] Skip Apple FairPlay m3u8 manifests (#14741) +* [YoutubeDL] Fix playlist range optimization for --playlist-items (#14740) + +Extractors +* [vshare] Capture and output error message +* [vshare] Fix extraction (#14473) +* [crunchyroll] Extract old RTMP formats +* [tva] Fix extraction (#14736) +* [gamespot] Lower preference of HTTP formats (#14652) +* [instagram:user] Fix extraction (#14699) +* [ccma] Fix typo (#14730) +- Remove sensitive data from logging in messages +* [instagram:user] Fix extraction (#14699) ++ [gamespot] Add support for article URLs (#14652) +* [gamespot] Skip Brightcove Once HTTP formats (#14652) +* [cartoonnetwork] Update tokenizer_src (#14666) ++ [wsj] Recognize another URL pattern (#14704) +* [pandatv] Update API URL and 
sign format URLs (#14693) +* [crunchyroll] Use old login method (#11572) + + +version 2017.11.06 + +Core ++ [extractor/common] Add protocol for f4m formats +* [f4m] Prefer baseURL for relative URLs (#14660) +* [extractor/common] Respect URL query in _extract_wowza_formats (#14645) + +Extractors ++ [hotstar:playlist] Add support for playlists (#12465) +* [hotstar] Bypass geo restriction (#14672) +- [22tracks] Remove extractor (#11024, #14628) ++ [skysport] Add support for ooyala videos protected with embed_token (#14641) +* [gamespot] Extract formats referenced with new data fields (#14652) +* [spankbang] Detect unavailable videos (#14644) + + +version 2017.10.29 + +Core +* [extractor/common] Prefix format id for audio only HLS formats ++ [utils] Add support for zero years and months in parse_duration + +Extractors +* [egghead] Fix extraction (#14388) ++ [fxnetworks] Extract series metadata (#14603) ++ [younow] Add support for younow.com (#9255, #9432, #12436) +* [dctptv] Fix extraction (#14599) +* [youtube] Restrict embed regular expression (#14600) +* [vimeo] Restrict iframe embed regular expression (#14600) +* [soundgasm] Improve extraction (#14588) +- [myvideo] Remove extractor (#8557) ++ [nbc] Add support for classic-tv videos (#14575) ++ [vrtnu] Add support for cookies authentication and simplify (#11873) ++ [canvas] Add support for vrt.be/vrtnu (#11873) +* [twitch:clips] Fix title extraction (#14566) ++ [ndtv] Add support for sub-sites (#14534) +* [dramafever] Fix login error message extraction ++ [nick] Add support for more nickelodeon sites (no, dk, se, ch, fr, es, pt, + ro, hu) (#14553) + + +version 2017.10.20 + +Core +* [downloader/fragment] Report warning instead of error on inconsistent + download state +* [downloader/hls] Fix total fragments count when ad fragments exist + +Extractors +* [parliamentliveuk] Fix extraction (#14524) +* [soundcloud] Update client id (#14546) ++ [servus] Add support for servus.com (#14362) ++ [unity] Add support for unity3d.com 
(#14528) +* [youtube] Replace youtube redirect URLs in description (#14517) +* [pbs] Restrict direct video URL regular expression (#14519) +* [drtv] Respect preference for direct HTTP formats (#14509) ++ [eporner] Add support for embed URLs (#14507) +* [arte] Capture and output error message +* [niconico] Improve uploader metadata extraction robustness (#14135) + + +version 2017.10.15.1 + +Core +* [downloader/hls] Ignore anvato ad fragments (#14496) +* [downloader/fragment] Output ad fragment count + +Extractors +* [scrippsnetworks:watch] Bypass geo restriction ++ [anvato] Add ability to bypass geo restriction +* [redditr] Fix extraction for URLs with query (#14495) + + +version 2017.10.15 + +Core ++ [common] Add support for jwplayer youtube embeds + +Extractors +* [scrippsnetworks:watch] Fix extraction (#14389) +* [anvato] Process master m3u8 manifests +* [youtube] Fix relative URLs in description +* [spike] Bypass geo restriction ++ [howstuffworks] Add support for more domains +* [infoq] Fix http format downloading ++ [rtlnl] Add support for another type of embeds ++ [onionstudios] Add support for bulbs-video embeds +* [udn] Fix extraction +* [shahid] Fix extraction (#14448) +* [kaltura] Ignore Widevine encrypted video (.wvm) (#14471) +* [vh1] Fix extraction (#9613) + + +version 2017.10.12 + +Core +* [YoutubeDL] Improve _default_format_spec (#14461) + +Extractors +* [steam] Fix extraction (#14067) ++ [funk] Add support for funk.net (#14464) ++ [nexx] Add support for shortcuts and relax domain id extraction ++ [voxmedia] Add support for recode.net (#14173) ++ [once] Add support for vmap URLs ++ [generic] Add support for channel9 embeds (#14469) +* [tva] Fix extraction (#14328) ++ [tubitv] Add support for new URL format (#14460) +- [afreecatv:global] Remove extractor +- [youtube:shared] Removed extractor (#14420) ++ [slideslive] Add support for slideslive.com (#2680) ++ [facebook] Support thumbnails (#14416) +* [vvvvid] Fix episode number extraction (#14456) +* 
[hrti:playlist] Relax URL regular expression +* [wdr] Relax media link regular expression (#14447) +* [hrti] Relax URL regular expression (#14443) +* [fox] Delegate extraction to uplynk:preplay (#14147) ++ [youtube] Add support for hooktube.com (#14437) + + +version 2017.10.07 + +Core +* [YoutubeDL] Ignore duplicates in --playlist-items +* [YoutubeDL] Fix out of range --playlist-items for iterable playlists and + reduce code duplication (#14425) ++ [utils] Use cache in OnDemandPagedList by default +* [postprocessor/ffmpeg] Convert to opus using libopus (#14381) + +Extractors +* [reddit] Sort formats (#14430) +* [lnkgo] Relax URL regular expression (#14423) +* [pornflip] Extend URL regular expression (#14405, #14406) ++ [xtube] Add support for embed URLs (#14417) ++ [xvideos] Add support for embed URLs and improve extraction (#14409) +* [beeg] Fix extraction (#14403) +* [tvn24] Relax URL regular expression (#14395) +* [nbc] Fix extraction (#13651, #13715, #14137, #14198, #14312, #14314, #14378, + #14392, #14414, #14419, #14431) ++ [ketnet] Add support for videos without direct sources (#14377) +* [canvas] Generalize mediazone.vrt.be extractor and rework canvas and een ++ [afreecatv] Add support for adult videos (#14376) + + +version 2017.10.01 + +Core +* [YoutubeDL] Document youtube_include_dash_manifest + +Extractors ++ [tvp] Add support for new URL schema (#14368) ++ [generic] Add support for single format Video.js embeds (#14371) +* [yahoo] Bypass geo restriction for brightcove (#14210) +* [yahoo] Use extracted brightcove account id (#14210) +* [rtve:alacarta] Fix extraction (#14290) ++ [yahoo] Add support for custom brightcove embeds (#14210) ++ [generic] Add support for Video.js embeds ++ [gfycat] Add support for /gifs/detail URLs (#14322) +* [generic] Fix infinite recursion for twitter:player URLs (#14339) +* [xhamsterembed] Fix extraction (#14308) + + +version 2017.09.24 + +Core ++ [options] Accept lrc as a subtitle conversion target format (#14292) +* 
[utils] Fix handling raw TTML subtitles (#14191) + +Extractors +* [24video] Fix timestamp extraction and make non fatal (#14295) ++ [24video] Add support for 24video.adult (#14295) ++ [kakao] Add support for tv.kakao.com (#12298, #14007) ++ [twitter] Add support for URLs without user id (#14270) ++ [americastestkitchen] Add support for americastestkitchen.com (#10764, + #13996) +* [generic] Fix support for multiple HTML5 videos on one page (#14080) +* [mixcloud] Fix extraction (#14088, #14132) ++ [lynda] Add support for educourse.ga (#14286) +* [beeg] Fix extraction (#14275) +* [nbcsports:vplayer] Correct theplatform URL (#13873) +* [twitter] Fix duration extraction (#14141) +* [tvplay] Bypass geo restriction ++ [heise] Add support for YouTube embeds (#14109) ++ [popcorntv] Add support for popcorntv.it (#5914, #14211) +* [viki] Update app data (#14181) +* [morningstar] Relax URL regular expression (#14222) +* [openload] Fix extraction (#14225, #14257) +* [noovo] Fix extraction (#14214) +* [dailymotion:playlist] Relax URL regular expression (#14219) ++ [twitch] Add support for go.twitch.tv URLs (#14215) +* [vgtv] Relax URL regular expression (#14223) + + +version 2017.09.15 + +Core +* [downloader/fragment] Restart inconsistent incomplete fragment downloads + (#13731) +* [YoutubeDL] Download raw subtitles files (#12909, #14191) + +Extractors +* [condenast] Fix extraction (#14196, #14207) ++ [orf] Add support for f4m stories +* [tv4] Relax URL regular expression (#14206) +* [animeondemand] Bypass geo restriction ++ [animeondemand] Add support for flash videos (#9944) + + +version 2017.09.11 + +Extractors +* [rutube:playlist] Fix suitable (#14166) + + +version 2017.09.10 + +Core ++ [utils] Introduce bool_or_none +* [YoutubeDL] Ensure dir existence for each requested format (#14116) + +Extractors +* [fox] Fix extraction (#14147) +* [rutube] Use bool_or_none +* [rutube] Rework and generalize playlist extractors (#13565) ++ [rutube:playlist] Add support for playlists 
(#13534, #13565) ++ [radiocanada] Add fallback for title extraction (#14145) +* [vk] Use dedicated YouTube embeds extraction routine +* [vice] Use dedicated YouTube embeds extraction routine +* [cracked] Use dedicated YouTube embeds extraction routine +* [chilloutzone] Use dedicated YouTube embeds extraction routine +* [abcnews] Use dedicated YouTube embeds extraction routine +* [youtube] Separate methods for embeds extraction +* [redtube] Fix formats extraction (#14122) +* [arte] Relax unavailability check (#14112) ++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) +* [vidme:user] Relax URL regular expression (#14054) +* [bpb] Fix extraction (#14043, #14086) +* [soundcloud] Fix download URL with private tracks (#14093) +* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) +* [viidea] Capture and output lecture error message (#14099) +* [radiocanada] Skip unsupported platforms (#14100) + + +version 2017.09.02 + +Extractors +* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, + #14077, #14079, #14082, #14083, #14094, #14095, #14096) +* [youtube] Fix upload date extraction (#14065) ++ [charlierose] Add support for episodes (#14062) ++ [bbccouk] Add support for w-prefixed ids (#14056) +* [googledrive] Extend URL regular expression (#9785) ++ [googledrive] Add support for source format (#14046) +* [pornhd] Fix extraction (#14005) + + +version 2017.08.27.1 + +Extractors + +* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037) + + +version 2017.08.27 + +Core ++ [extractor/common] Extract height and format id for HTML5 videos (#14034) +* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, + #8625, #9483) + * Simplify code and split into separate routines to facilitate maintaining + * Make retry mechanism work on errors during actual download not only + during connection establishment phase + * Retry on ECONNRESET and ETIMEDOUT during reading data from 
network + * Retry on content too short + * Show error description on retry + +Extractors +* [generic] Lower preference for extraction from LD-JSON +* [rai] Fix audio formats extraction (#14024) +* [youtube] Fix controversy videos extraction (#14027, #14029) +* [mixcloud] Fix extraction (#14015, #14020) + + +version 2017.08.23 + +Core ++ [extractor/common] Introduce _parse_xml +* [extractor/common] Make HLS and DASH extraction in _parse_html5_media_entries + non fatal (#13970) +* [utils] Fix unescapeHTML for malformed string like "&a"" (#13935) + +Extractors +* [cbc:watch] Bypass geo restriction (#13993) +* [toutv] Relax DRM check (#13994) ++ [googledrive] Add support for subtitles (#13619, #13638) +* [pornhub] Relax uploader regular expression (#13906, #13975) +* [bandcamp:album] Extract track titles (#13962) ++ [bbccouk] Add support for events URLs (#13893) ++ [liveleak] Support multi-video pages (#6542) ++ [liveleak] Support another liveleak embedding pattern (#13336) +* [cda] Fix extraction (#13935) ++ [laola1tv] Add support for tv.ittf.com (#13965) +* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003) + + +version 2017.08.18 + +Core +* [YoutubeDL] Sanitize byte string format URLs (#13951) ++ [extractor/common] Add support for float durations in _parse_mpd_formats + (#13919) + +Extractors +* [arte] Detect unavailable videos (#13945) +* [generic] Convert redirect URLs to unicode strings (#13951) +* [udemy] Fix paid course detection (#13943) +* [pluralsight] Use RPC API for course extraction (#13937) ++ [clippit] Add support for clippituser.tv ++ [qqmusic] Support new URL schemes (#13805) +* [periscope] Renew HLS extraction (#13917) +* [mixcloud] Extract decrypt key + + +version 2017.08.13 + +Core +* [YoutubeDL] Make sure format id is not empty +* [extractor/common] Make _family_friendly_search optional +* [extractor/common] Respect source's type attribute for HTML5 media (#13892) + +Extractors +* [pornhub:playlistbase] Skip videos from drop-down menu 
(#12819, #13902) ++ [fourtube] Add support for pornerbros.com (#6022) ++ [fourtube] Add support for porntube.com (#7859, #13901) ++ [fourtube] Add support for fux.com +* [limelight] Improve embeds detection (#13895) ++ [reddit] Add support for v.redd.it and reddit.com (#13847) +* [aparat] Extract all formats (#13887) +* [mixcloud] Fix play info decryption (#13885) ++ [generic] Add support for vzaar embeds (#13876) + + +version 2017.08.09 + +Core +* [utils] Skip missing params in cli_bool_option (#13865) + +Extractors +* [xxxymovies] Fix title extraction (#13868) ++ [nick] Add support for nick.com.pl (#13860) +* [mixcloud] Fix play info decryption (#13867) +* [20min] Fix embeds extraction (#13852) +* [dplayit] Fix extraction (#13851) ++ [niconico] Support videos with multiple formats (#13522) ++ [niconico] Support HTML5-only videos (#13806) + + +version 2017.08.06 + +Core +* Use relative paths for DASH fragments (#12990) + +Extractors +* [pluralsight] Fix format selection +- [mpora] Remove extractor (#13826) ++ [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218) +* [vlive:channel] Limit number of videos per page to 100 (#13830) +* [podomatic] Extend URL regular expression (#13827) +* [cinchcast] Extend URL regular expression +* [yandexdisk] Relax URL regular expression (#13824) +* [vidme] Extract DASH and HLS formats +- [teamfour] Remove extractor (#13782) +* [pornhd] Fix extraction (#13783) +* [udemy] Fix subtitles extraction (#13812) +* [mlb] Extend URL regular expression (#13740, #13773) ++ [pbs] Add support for new URL schema (#13801) +* [nrktv] Update API host (#13796) + + +version 2017.07.30.1 + +Core +* [downloader/hls] Use redirect URL as manifest base (#13755) +* [options] Correctly hide login info from debug outputs (#13696) + +Extractors ++ [watchbox] Add support for watchbox.de (#13739) +- [clipfish] Remove extractor ++ [youjizz] Fix extraction (#13744) ++ [generic] Add support for another ooyala embed pattern (#13727) ++ [ard] Add support for 
lives (#13771) +* [soundcloud] Update client id ++ [soundcloud:trackstation] Add support for track stations (#13733) +* [svtplay] Use geo verification proxy for API request +* [svtplay] Update API URL (#13767) ++ [yandexdisk] Add support for yadi.sk (#13755) ++ [megaphone] Add support for megaphone.fm +* [amcnetworks] Make rating optional (#12453) +* [cloudy] Fix extraction (#13737) ++ [nickru] Add support for nickelodeon.ru +* [mtv] Improve thumbnail extraction +* [nick] Automate geo-restriction bypass (#13711) +* [niconico] Improve error reporting (#13696) + + +version 2017.07.23 + +Core +* [YoutubeDL] Improve default format specification (#13704) +* [YoutubeDL] Do not override id, extractor and extractor_key for + url_transparent entities +* [extractor/common] Fix playlist_from_matches + +Extractors +* [itv] Fix production id extraction (#13671, #13703) +* [vidio] Make duration non fatal and fix typo +* [mtv] Skip missing video parts (#13690) +* [sportbox:embed] Fix extraction ++ [npo] Add support for npo3.nl URLs (#13695) +* [dramafever] Remove video id from title (#13699) ++ [egghead:lesson] Add support for lessons (#6635) +* [funnyordie] Extract more metadata (#13677) +* [youku:show] Fix playlist extraction (#13248) ++ [dispeak] Recognize sevt subdomain (#13276) +* [adn] Improve error reporting (#13663) +* [crunchyroll] Relax series and season regular expression (#13659) ++ [spiegel:article] Add support for nexx iframe embeds (#13029) ++ [nexx:embed] Add support for iframe embeds +* [nexx] Improve JS embed extraction ++ [pearvideo] Add support for pearvideo.com (#13031) + + +version 2017.07.15 + +Core +* [YoutubeDL] Don't expand environment variables in meta fields (#13637) + +Extractors +* [spiegeltv] Delegate extraction to nexx extractor (#13159) ++ [nexx] Add support for nexx.cloud (#10807, #13465) +* [generic] Fix rutube embeds extraction (#13641) +* [karrierevideos] Fix title extraction (#13641) +* [youtube] Don't capture YouTube Red ad for creator meta 
field (#13621) +* [slideshare] Fix extraction (#13617) ++ [5tv] Add another video URL pattern (#13354, #13606) +* [drtv] Make HLS and HDS extraction non fatal +* [ted] Fix subtitles extraction (#13628, #13629) +* [vine] Make sure the title won't be empty ++ [twitter] Support HLS streams in vmap URLs ++ [periscope] Support pscp.tv URLs in embedded frames +* [twitter] Extract mp4 urls via mobile API (#12726) +* [niconico] Fix authentication error handling (#12486) +* [giantbomb] Extract m3u8 formats (#13626) ++ [vlive:playlist] Add support for playlists (#13613) + + +version 2017.07.09 + +Core ++ [extractor/common] Add support for AMP tags in _parse_html5_media_entries ++ [utils] Support attributes with no values in get_elements_by_attribute + +Extractors ++ [dailymail] Add support for embeds ++ [joj] Add support for joj.sk (#13268) +* [abc.net.au:iview] Extract more formats (#13492, #13489) +* [egghead:course] Fix extraction (#6635, #13370) ++ [cjsw] Add support for cjsw.com (#13525) ++ [eagleplatform] Add support for referrer protected videos (#13557) ++ [eagleplatform] Add support for another embed pattern (#13557) +* [veoh] Extend URL regular expression (#13601) +* [npo:live] Fix live stream id extraction (#13568, #13605) +* [googledrive] Fix height extraction (#13603) ++ [dailymotion] Add support for new layout (#13580) +- [yam] Remove extractor +* [xhamster] Extract all formats and fix duration extraction (#13593) ++ [xhamster] Add support for new URL schema (#13593) +* [espn] Extend URL regular expression (#13244, #13549) +* [kaltura] Fix typo in subtitles extraction (#13569) +* [vier] Adapt extraction to redesign (#13575) + + +version 2017.07.02 + +Core +* [extractor/common] Improve _json_ld + +Extractors ++ [thisoldhouse] Add more fallbacks for video id +* [thisoldhouse] Fix video id extraction (#13540, #13541) +* [xfileshare] Extend format regular expression (#13536) +* [ted] Fix extraction (#13535) ++ [tastytrade] Add support for tastytrade.com (#13521) +* 
[dplayit] Relax video id regular expression (#13524) ++ [generic] Extract more generic metadata (#13527) ++ [bbccouk] Capture and output error message (#13501, #13518) +* [cbsnews] Relax video info regular expression (#13284, #13503) ++ [facebook] Add support for plugin video embeds and multiple embeds (#13493) +* [soundcloud] Switch to https for API requests (#13502) +* [pandatv] Switch to https for API and download URLs ++ [pandatv] Add support for https URLs (#13491) ++ [niconico] Support sp subdomain (#13494) + + +version 2017.06.25 + +Core ++ [adobepass] Add support for DIRECTV NOW (mso ATTOTT) (#13472) +* [YoutubeDL] Skip malformed formats for better extraction robustness + +Extractors ++ [wsj] Add support for barrons.com (#13470) ++ [ign] Add another video id pattern (#13328) ++ [raiplay:live] Add support for live streams (#13414) ++ [redbulltv] Add support for live videos and segments (#13486) ++ [onetpl] Add support for videos embedded via pulsembed (#13482) +* [ooyala] Make more robust +* [ooyala] Skip empty format URLs (#13471, #13476) +* [hgtv.com:show] Fix typo + + +version 2017.06.23 + +Core +* [adobepass] Fix extraction on older python 2.6 + +Extractors +* [youtube] Adapt to new automatic captions rendition (#13467) +* [hgtv.com:show] Relax video config regular expression (#13279, #13461) +* [drtuber] Fix formats extraction (#12058) +* [youporn] Fix upload date extraction +* [youporn] Improve formats extraction +* [youporn] Fix title extraction (#13456) +* [googledrive] Fix formats sorting (#13443) +* [watchindianporn] Fix extraction (#13411, #13415) ++ [vimeo] Add fallback mp4 extension for original format ++ [ruv] Add support for ruv.is (#13396) +* [viu] Fix extraction on older python 2.6 +* [pandora.tv] Fix upload_date extraction (#12846) ++ [asiancrush] Add support for asiancrush.com (#13420) + + +version 2017.06.18 + +Core +* [downloader/common] Use utils.shell_quote for debug command line +* [utils] Use compat_shlex_quote in shell_quote +* 
[postprocessor/execafterdownload] Encode command line (#13407) +* [compat] Fix compat_shlex_quote on Windows (#5889, #10254) +* [postprocessor/metadatafromtitle] Fix missing optional meta fields processing + in --metadata-from-title (#13408) +* [extractor/common] Fix json dumping with --geo-bypass ++ [extractor/common] Improve jwplayer subtitles extraction ++ [extractor/common] Improve jwplayer formats extraction (#13379) + +Extractors +* [polskieradio] Fix extraction (#13392) ++ [xfileshare] Add support for fastvideo.me (#13385) +* [bilibili] Fix extraction of videos with double quotes in titles (#13387) +* [4tube] Fix extraction (#13381, #13382) ++ [disney] Add support for disneychannel.de (#13383) +* [npo] Improve URL regular expression (#13376) ++ [corus] Add support for showcase.ca ++ [corus] Add support for history.ca (#13359) + + +version 2017.06.12 + +Core +* [utils] Handle compat_HTMLParseError in extract_attributes (#13349) ++ [compat] Introduce compat_HTMLParseError +* [utils] Improve unified_timestamp +* [extractor/generic] Ensure format id is unicode string +* [extractor/common] Return unicode string from _match_id ++ [YoutubeDL] Sanitize more fields (#13313) + +Extractors ++ [xfileshare] Add support for rapidvideo.tv (#13348) +* [xfileshare] Modernize and pass Referer ++ [rutv] Add support for testplayer.vgtrk.com (#13347) ++ [newgrounds] Extract more metadata (#13232) ++ [newgrounds:playlist] Add support for playlists (#10611) +* [newgrounds] Improve formats and uploader extraction (#13346) +* [msn] Fix formats extraction +* [turbo] Ensure format id is string +* [sexu] Ensure height is int +* [jove] Ensure comment count is int +* [golem] Ensure format id is string +* [gfycat] Ensure filesize is int +* [foxgay] Ensure height is int +* [flickr] Ensure format id is string +* [sohu] Fix numeric fields +* [safari] Improve authentication detection (#13319) +* [liveleak] Ensure height is int (#13313) +* [streamango] Make title optional (#13292) +* [rtlnl] 
Improve URL regular expression (#13295) +* [tvplayer] Fix extraction (#13291) + + +version 2017.06.05 + +Core +* [YoutubeDL] Don't emit ANSI escape codes on Windows (#13270) + +Extractors ++ [bandcamp:weekly] Add support for bandcamp weekly (#12758) +* [pornhub:playlist] Fix extraction (#13281) +- [godtv] Remove extractor (#13175) +* [safari] Fix typo (#13252) +* [youtube] Improve chapters extraction (#13247) +* [1tv] Lower preference for HTTP formats (#13246) +* [francetv] Relax URL regular expression +* [drbonanza] Fix extraction (#13231) +* [packtpub] Fix authentication (#13240) + + +version 2017.05.29 + +Extractors +* [youtube] Fix DASH MPD extraction for videos with non-encrypted format URLs + (#13211) +* [xhamster] Fix uploader and like/dislike count extraction (#13216) ++ [xhamster] Extract categories (#11728) ++ [abcnews] Add support for embed URLs (#12851) +* [gaskrank] Fix extraction (#12493) +* [medialaan] Fix videos with missing videoUrl (#12774) +* [dvtv] Fix playlist support ++ [dvtv] Add support for DASH and HLS formats (#3063) ++ [beam:vod] Add support for beam.pro/mixer.com VODs (#13032) +* [cbsinteractive] Relax URL regular expression (#13213) +* [adn] Fix formats extraction ++ [youku] Extract more metadata (#10433) +* [cbsnews] Fix extraction (#13205) + + +version 2017.05.26 + +Core ++ [utils] strip_jsonp() can recognize more patterns +* [postprocessor/ffmpeg] Fix metadata filename handling on Python 2 (#13182) + +Extractors ++ [youtube] DASH MPDs with cipher signatures are recognized now (#11381) ++ [bbc] Add support for authentication +* [tudou] Merge into youku extractor (#12214) +* [youku:show] Fix extraction +* [youku] Fix extraction (#13191) +* [udemy] Fix extraction for outputs' format entries without URL (#13192) +* [vimeo] Fix formats' sorting (#13189) +* [cbsnews] Fix extraction for 60 Minutes videos (#12861) + + +version 2017.05.23 + +Core ++ [downloader/external] Pass -loglevel to ffmpeg downloader (#13183) ++ [adobepass] Add
support for Bright House Networks (#13149) + +Extractors ++ [streamcz] Add support for subtitles (#13174) +* [youtube] Fix DASH manifest signature decryption (#8944, #13156) +* [toggle] Relax URL regular expression (#13172) +* [toypics] Fix extraction (#13077) +* [njpwworld] Fix extraction (#13162, #13169) ++ [hitbox] Add support for smashcast.tv (#13154) +* [mitele] Update app key regular expression (#13158) + + +version 2017.05.18.1 + +Core +* [jsinterp] Fix typo and cleanup regular expressions (#13134) + + +version 2017.05.18 + +Core ++ [jsinterp] Add support for quoted names and indexers (#13123, #13124, #13125, + #13126, #13128, #13129, #13130, #13131, #13132) ++ [extractor/common] Add support for schemeless URLs in _extract_wowza_formats + (#13088, #13092) ++ [utils] Recognize more audio codecs (#13081) + +Extractors ++ [vier] Extract more metadata (#12539) +* [vier] Improve extraction (#12801) + + Add support for authentication + * Bypass authentication when no credentials provided + * Improve extraction robustness +* [dailymail] Fix sources extraction (#13057) +* [dailymotion] Extend URL regular expression (#13079) + + +version 2017.05.14 + +Core ++ [extractor/common] Respect Width and Height attributes in ISM manifests ++ [postprocessor/metadatafromtitle] Add support for regular expression syntax in + --metadata-from-title (#13065) + +Extractors ++ [mediaset] Add support for video.mediaset.it (#12708, #12964) +* [orf:radio] Fix extraction (#11643, #12926) +* [aljazeera] Extend URL regular expression (#13053) +* [imdb] Relax URL regular expression (#13056) ++ [francetv] Add support for mobile.france.tv (#13068) ++ [upskill] Add support for upskillcourses.com (#13043) +* [thescene] Fix extraction (#13061) +* [condenast] Improve embed support +* [liveleak] Fix extraction (#12053) ++ [douyu] Support Douyu shows (#12228) +* [myspace] Improve URL regular expression (#13040) +* [adultswim] Use desktop platform in assets URL (#13041) + + +version 2017.05.09 + +Core
+* [YoutubeDL] Force --restrict-filenames when no locale is set on all python + versions (#13027) + +Extractors +* [francetv] Adapt to site redesign (#13034) ++ [packtpub] Add support for authentication (#12622) +* [drtv] Lower preference for SignLanguage formats (#13013, #13016) ++ [cspan] Add support for brightcove live embeds (#13028) +* [vrv] Extract DASH formats and subtitles +* [funimation] Fix authentication (#13021) +* [adultswim] Fix extraction (#8640, #10950, #11042, #12121) + + Add support for Adobe Pass authentication + + Add support for live streams + + Add support for show pages +* [turner] Extract thumbnail, is_live and strip description ++ [nonktube] Add support for nonktube.com (#8647, #13024) ++ [nuevo] Pass headers to _extract_nuevo +* [nbc] Improve extraction (#12364) + + +version 2017.05.07 + +Common +* [extractor/common] Fix typo in _extract_akamai_formats ++ [postprocessor/ffmpeg] Embed chapters into media file with --add-metadata ++ [extractor/common] Introduce chapters meta field + +Extractors +* [youtube] Fix authentication (#12820, #12927, #12973, #12992, #12993, #12995, + #13003) +* [bilibili] Fix video downloading (#13001) +* [rmcdecouverte] Fix extraction (#12937) +* [theplatform] Extract chapters +* [bandcamp] Fix thumbnail extraction (#12980) +* [pornhub] Extend URL regular expression (#12996) ++ [youtube] Extract chapters ++ [nrk] Extract chapters ++ [vice] Add support for ooyala embeds in article pages ++ [vice] Support vice articles (#12968) +* [vice] Fix extraction for non en_us videos (#12967) +* [gdcvault] Fix extraction for some videos (#12733) +* [pbs] Improve multipart video support (#12981) +* [laola1tv] Fix extraction (#12880) ++ [cda] Support birthday verification (#12789) +* [leeco] Fix extraction (#12974) ++ [pbs] Extract chapters +* [amp] Improve thumbnail and subtitles extraction +* [foxsports] Fix extraction (#12945) +- [coub] Remove comment count extraction (#12941) + + +version 2017.05.01 + +Core ++
[extractor/common] Extract view count from JSON-LD +* [utils] Improve unified_timestamp ++ [utils] Add video/mp2t to mimetype2ext +* [downloader/external] Properly handle live stream downloading cancellation + (#8932) ++ [utils] Add support for unicode whitespace in clean_html on python 2 (#12906) + +Extractors +* [infoq] Make audio format extraction non fatal (#12938) +* [brightcove] Allow whitespace around attribute names in embedded code ++ [zaq1] Add support for zaq1.pl (#12693) ++ [xvideos] Extract duration (#12828) +* [vevo] Fix extraction (#12879) ++ [noovo] Add support for noovo.ca (#12792) ++ [washingtonpost] Add support for embeds (#12699) +* [yandexmusic:playlist] Fix extraction for python 3 (#12888) +* [anvato] Improve extraction (#12913) + * Promote to regular shortcut based extractor + * Add mcp to access key mapping table + * Add support for embeds extraction + * Add support for anvato embeds in generic extractor +* [xtube] Fix extraction for older FLV videos (#12734) +* [tvplayer] Fix extraction (#12908) + + +version 2017.04.28 + +Core ++ [adobepass] Use geo verification headers for all requests +- [downloader/fragment] Remove assert for resume_len when no fragments + downloaded ++ [extractor/common] Add manifest_url for explicit group rendition formats +* [extractor/common] Fix manifest_url for m3u8 formats +- [extractor/common] Don't list master m3u8 playlists in format list (#12832) + +Extractor +* [aenetworks] Fix extraction for shows with single season ++ [go] Add support for Disney, DisneyJunior and DisneyXD show pages +* [youtube] Recognize new locale-based player URLs (#12885) ++ [streamable] Add support for new embedded URL schema (#12844) +* [arte:+7] Relax URL regular expression (#12837) + + +version 2017.04.26 + +Core +* Introduce --keep-fragments for keeping fragments of fragmented download + on disk after download is finished +* [YoutubeDL] Fix output template for missing timestamp (#12796) +* [socks] Handle cases where credentials are 
required but missing +* [extractor/common] Improve HLS extraction (#12211) + * Extract m3u8 parsing to separate method + * Improve rendition groups extraction + * Build stream name according stream GROUP-ID + * Ignore reference to AUDIO group without URI when stream has no CODECS + * Use float for scaled tbr in _parse_m3u8_formats +* [utils] Add support for TTML styles in dfxp2srt +* [downloader/hls] No need to download keys for fragments that have been + already downloaded +* [downloader/fragment] Improve fragment downloading + * Resume immediately + * Don't concatenate fragments and decrypt them on every resume + * Optimize disk storage usage, don't store intermediate fragments on disk + * Store bookkeeping download state file ++ [extractor/common] Add support for multiple getters in try_get ++ [extractor/common] Add support for video of WebPage context in _json_ld + (#12778) ++ [extractor/common] Relax JWPlayer regular expression and remove + duplicate URLs (#12768) + +Extractors +* [iqiyi] Fix extraction of Yule videos +* [vidio] Improve extraction and sort formats ++ [brightcove] Match only video elements with data-video-id attribute +* [iqiyi] Fix playlist detection (#12504) +- [azubu] Remove extractor (#12813) +* [porn91] Fix extraction (#12814) +* [vidzi] Fix extraction (#12793) ++ [amp] Extract error message (#12795) ++ [xfileshare] Add support for gorillavid.com and daclips.com (#12776) +* [instagram] Fix extraction (#12777) ++ [generic] Support Brightcove videos in ', + webpage, 'embed url')) + if VKIE.suitable(embed_url): + return self.url_result(embed_url, VKIE.ie_key(), video_id) + + embed_page = self._download_webpage( + embed_url, video_id, headers={'Referer': url}) + video_ext = self._get_cookies(embed_url).get('video_ext') + if video_ext: + video_ext = compat_urllib_parse_unquote(video_ext.value) + if not video_ext: + video_ext = compat_b64decode(self._search_regex( + r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)', + embed_page, 'video_ext')).decode() 
+ video_id, sig, _, access_token = video_ext.split(':') + item = self._download_json( + 'https://api.vk.com/method/video.get', video_id, + headers={'User-Agent': 'okhttp/3.4.1'}, query={ + 'access_token': access_token, + 'sig': sig, + 'v': 5.44, + 'videos': video_id, + })['response']['items'][0] + title = item['title'] + + formats = [] + for f_id, f_url in item.get('files', {}).items(): + if f_id == 'external': + return self.url_result(f_url) + ext, height = f_id.split('_') + formats.append({ + 'format_id': height + 'p', + 'url': f_url, + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + thumbnails = [] + for k, v in item.items(): + if k.startswith('photo_') and v: + width = k.replace('photo_', '') + thumbnails.append({ + 'id': width, + 'url': v, + 'width': int_or_none(width), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'comment_count': int_or_none(item.get('comments')), + 'description': item.get('description'), + 'duration': int_or_none(item.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('date')), + 'uploader': item.get('owner_id'), + 'view_count': int_or_none(item.get('views')), + } diff --git a/youtube_dlc/extractor/bitchute.py b/youtube_dlc/extractor/bitchute.py new file mode 100644 index 0000000..92fc70b --- /dev/null +++ b/youtube_dlc/extractor/bitchute.py @@ -0,0 +1,150 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + GeoRestrictedError, + orderedSet, + unified_strdate, + urlencode_postdata, +) + + +class BitChuteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bitchute.com/video/szoMrox2JEI/', + 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', + 'info_dict': { + 'id': 'szoMrox2JEI', + 'ext': 'mp4', + 'title': 'Fuck bitches get money', + 
'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Victoria X Rave', + 'upload_date': '20170813', + }, + }, { + 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', + 'only_matching': True, + }, { + 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', + }) + + title = self._html_search_regex( + (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', + default=None) or self._og_search_description(webpage) + + format_urls = [] + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): + format_urls.append(mobj.group('url')) + format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) + + formats = [ + {'url': format_url} + for format_url in orderedSet(format_urls)] + + if not formats: + entries = self._parse_html5_media_entries( + url, webpage, video_id) + if not entries: + error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video') + if error == 'Video Unavailable': + raise GeoRestrictedError(error) + raise ExtractorError(error) + formats = entries[0]['formats'] + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'twitter:image:src', webpage, 'thumbnail') + uploader = 
self._html_search_regex( + (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), + webpage, 'uploader', fatal=False) + + upload_date = unified_strdate(self._search_regex( + r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', + webpage, 'upload date', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'formats': formats, + } + + +class BitChuteChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.bitchute.com/channel/victoriaxrave/', + 'playlist_mincount': 185, + 'info_dict': { + 'id': 'victoriaxrave', + }, + } + + _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + + def _entries(self, channel_id): + channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id + offset = 0 + for page_num in itertools.count(1): + data = self._download_json( + '%sextend/' % channel_url, channel_id, + 'Downloading channel page %d' % page_num, + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': offset, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': channel_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': 'csrftoken=%s' % self._TOKEN, + }) + if data.get('success') is False: + break + html = data.get('html') + if not html: + break + video_ids = re.findall( + r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', + html) + if not video_ids: + break + offset += len(video_ids) + for video_id in video_ids: + yield self.url_result( + 'https://www.bitchute.com/video/%s' % video_id, + ie=BitChuteIE.ie_key(), video_id=video_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + return 
self.playlist_result( + self._entries(channel_id), playlist_id=channel_id) diff --git a/youtube_dlc/extractor/bleacherreport.py b/youtube_dlc/extractor/bleacherreport.py new file mode 100644 index 0000000..dc60224 --- /dev/null +++ b/youtube_dlc/extractor/bleacherreport.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'uploader_id': 3992341, + 'description': 'CFB, ACC, Florida State', + 'timestamp': 1434380212, + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', + 'md5': '6a5cd403418c7b01719248ca97fb0692', + 'info_dict': { + 'id': '2586817', + 'ext': 'webm', + 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', + 'timestamp': 1446839961, + 'uploader': 'Sean Fay', + 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', + 'uploader_id': 6466954, + 'upload_date': '20151011', + }, + 'add_ie': ['Youtube'], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + + thumbnails = [] + primary_photo = article_data.get('primaryPhoto') + if primary_photo: + thumbnails = [{ + 'url': 
primary_photo['url'], + 'width': primary_photo.get('width'), + 'height': primary_photo.get('height'), + }] + + info = { + '_type': 'url_transparent', + 'id': article_id, + 'title': article_data['title'], + 'uploader': article_data.get('author', {}).get('name'), + 'uploader_id': article_data.get('authorId'), + 'timestamp': parse_iso8601(article_data.get('createdAt')), + 'thumbnails': thumbnails, + 'comment_count': int_or_none(article_data.get('commentsCount')), + 'view_count': int_or_none(article_data.get('hitCount')), + } + + video = article_data.get('video') + if video: + video_type = video['type'] + if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'): + info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] + elif video_type == 'ooyala.com': + info['url'] = 'ooyala:%s' % video['id'] + elif video_type == 'youtube.com': + info['url'] = video['id'] + elif video_type == 'vine.co': + info['url'] = 'https://vine.co/v/%s' % video['id'] + else: + info['url'] = video_type + video['id'] + return info + else: + raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' + _TESTS = [{ + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', + 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'info_dict': { + 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'ext': 'flv', + 'title': 'Cena vs. 
Rollins Would Expose the Heavyweight Division', + 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id) + info['id'] = video_id + return info diff --git a/youtube_dlc/extractor/blinkx.py b/youtube_dlc/extractor/blinkx.py new file mode 100644 index 0000000..db5e12b --- /dev/null +++ b/youtube_dlc/extractor/blinkx.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + remove_start, + int_or_none, +) + + +class BlinkxIE(InfoExtractor): + _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' + IE_NAME = 'blinkx' + + _TEST = { + 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', + 'md5': '337cf7a344663ec79bf93a526a2e06c7', + 'info_dict': { + 'id': 'Da0Gw3xc', + 'ext': 'mp4', + 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', + 'uploader': 'IGN News', + 'upload_date': '20150217', + 'timestamp': 1424215740, + 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', + 'duration': 47.743333, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + display_id = video_id[:8] + + api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + + 'video=%s' % video_id) + data_json = self._download_webpage(api_url, display_id) + data = json.loads(data_json)['api']['results'][0] + duration = None + thumbnails = [] + formats = [] + for m in data['media']: + if m['type'] == 'jpg': + thumbnails.append({ + 'url': m['link'], + 'width': int(m['w']), + 'height': int(m['h']), + }) + elif m['type'] == 'original': + duration = float(m['d']) + elif m['type'] == 'youtube': + yt_id = 
m['link'] + self.to_screen('Youtube video detected: %s' % yt_id) + return self.url_result(yt_id, 'Youtube', video_id=yt_id) + elif m['type'] in ('flv', 'mp4'): + vcodec = remove_start(m['vcodec'], 'ff') + acodec = remove_start(m['acodec'], 'ff') + vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) + abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) + tbr = vbr + abr if vbr and abr else None + format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) + formats.append({ + 'format_id': format_id, + 'url': m['link'], + 'vcodec': vcodec, + 'acodec': acodec, + 'abr': abr, + 'vbr': vbr, + 'tbr': tbr, + 'width': int_or_none(m.get('w')), + 'height': int_or_none(m.get('h')), + }) + + self._sort_formats(formats) + + return { + 'id': display_id, + 'fullid': video_id, + 'title': data['title'], + 'formats': formats, + 'uploader': data['channel_name'], + 'timestamp': data['pubdate_epoch'], + 'description': data.get('description'), + 'thumbnails': thumbnails, + 'duration': duration, + } diff --git a/youtube_dlc/extractor/bloomberg.py b/youtube_dlc/extractor/bloomberg.py new file mode 100644 index 0000000..2fbfad1 --- /dev/null +++ b/youtube_dlc/extractor/bloomberg.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class BloombergIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' + + _TESTS = [{ + 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', + # The md5 checksum changes + 'info_dict': { + 'id': 'qurhIVlJSB6hzkVi229d8g', + 'ext': 'flv', + 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', + 'description': 'md5:a8ba0302912d03d246979735c17d2761', + }, + 'params': { + 'format': 'best[format_id^=hds]', + }, + }, { + # video ID in BPlayer(...) 
+ 'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/', + 'info_dict': { + 'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74', + 'ext': 'flv', + 'title': 'Meet the Real-Life Tech Wizards of Middle Earth', + 'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.', + }, + 'params': { + 'format': 'best[format_id^=hds]', + }, + }, { + # data-bmmrid= + 'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money', + 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', + 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', + 'only_matching': True, + }] + + def _real_extract(self, url): + name = self._match_id(url) + webpage = self._download_webpage(url, name) + video_id = self._search_regex( + (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'), + webpage, 'id', group='id', default=None) + if not video_id: + bplayer_data = self._parse_json(self._search_regex( + r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + video_id = bplayer_data['id'] + title = re.sub(': Video$', '', self._og_search_title(webpage)) + + embed_info = self._download_json( + 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + formats = [] + for stream in embed_info['streams']: + stream_url = stream.get('url') + if not stream_url: + continue + if stream['muxing_format'] == 'TS': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + formats.extend(self._extract_f4m_formats( + stream_url, video_id, f4m_id='hds', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': 
title, + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dlc/extractor/bokecc.py b/youtube_dlc/extractor/bokecc.py new file mode 100644 index 0000000..6017e83 --- /dev/null +++ b/youtube_dlc/extractor/bokecc.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class BokeCCBaseIE(InfoExtractor): + def _extract_bokecc_formats(self, webpage, video_id, format_id=None): + player_params_str = self._html_search_regex( + r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)', + webpage, 'player params', group='query') + + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + formats = [{ + 'format_id': format_id, + 'url': quality.find('./copy').attrib['playurl'], + 'preference': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + + self._sort_formats(formats) + + return formats + + +class BokeCCIE(BokeCCBaseIE): + _IE_DESC = 'CC视频' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + + _TESTS = [{ + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A', + 'info_dict': { + 'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461', + 'ext': 'flv', + 'title': 'BokeCC Video', + }, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('vid') or not qs.get('uid'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + 
+ return { + 'id': video_id, + 'title': 'BokeCC Video', # no title provided in the webpage + 'formats': self._extract_bokecc_formats(webpage, video_id), + } diff --git a/youtube_dlc/extractor/bostonglobe.py b/youtube_dlc/extractor/bostonglobe.py new file mode 100644 index 0000000..57882fb --- /dev/null +++ b/youtube_dlc/extractor/bostonglobe.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' + _TESTS = [ + { + 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', + 'md5': '0a62181079c85c2d2b618c9a738aedaf', + 'info_dict': { + 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', + 'id': '5320421710001', + 'ext': 'mp4', + 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', + 'timestamp': 1486877593, + 'upload_date': '20170212', + 'uploader_id': '245991542', + }, + }, + { + # Embedded youtube video; we hand it off to the Generic extractor. 
+ 'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', + 'md5': '582b40327089d5c0c949b3c54b13c24b', + 'info_dict': { + 'title': "Who Is Matt Damon's Favorite Batman?", + 'id': 'ZW1QCnlA6Qc', + 'ext': 'mp4', + 'upload_date': '20170217', + 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', + 'uploader': 'The Late Late Show with James Corden', + 'uploader_id': 'TheLateLateShow', + }, + 'expected_warnings': ['404'], + }, + ] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + page_title = self._og_search_title(webpage, default=None) + + # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> + entries = [] + for video in re.findall(r'(?i)(<video[^>]+>)', webpage): + attrs = extract_attributes(video) + + video_id = attrs.get('data-brightcove-video-id') + account_id = attrs.get('data-account') + player_id = attrs.get('data-player') + embed = attrs.get('data-embed') + + if video_id and account_id and player_id and embed: + entries.append( + 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' + % (account_id, player_id, embed, video_id)) + + if len(entries) == 0: + return self.url_result(url, 'Generic') + elif len(entries) == 1: + return self.url_result(entries[0], 'BrightcoveNew') + else: + return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') diff --git a/youtube_dlc/extractor/bpb.py b/youtube_dlc/extractor/bpb.py new file mode 100644 index 0000000..0783353 --- /dev/null +++ b/youtube_dlc/extractor/bpb.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + determine_ext, +) + + +class BpbIE(InfoExtractor): + 
IE_DESC = 'Bundeszentrale für politische Bildung' + _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/' + + _TEST = { + 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', + # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 + 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', + 'info_dict': { + 'id': '297', + 'ext': 'mp4', + 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', + 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<h2 class="white">(.*?)</h2>', webpage, 'title') + video_info_dicts = re.findall( + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) + + formats = [] + for video_info in video_info_dicts: + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' + formats.append({ + 'url': video_url, + 'preference': 10 if quality == 'high' else 0, + 'format_note': quality, + 'format_id': '%s-%s' % (quality, determine_ext(video_url)), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': self._og_search_description(webpage), + } diff --git a/youtube_dlc/extractor/br.py b/youtube_dlc/extractor/br.py new file mode 100644 index 0000000..9bde7f2 --- /dev/null +++ b/youtube_dlc/extractor/br.py @@ -0,0 +1,311 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + 
class BRIE(InfoExtractor):
    """Extractor for br.de / br-klassik.de article pages with an embedded player."""

    IE_DESC = 'Bayerischer Rundfunk'
    _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html'

    _TESTS = [
        {
            'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
            'md5': '83a0477cf0b8451027eb566d88b51106',
            'info_dict': {
                'id': '48f656ef-287e-486f-be86-459122db22cc',
                'ext': 'mp4',
                'title': 'Die böse Überraschung',
                'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9',
                'duration': 180,
                'uploader': 'Reinhard Weber',
                'upload_date': '20150422',
            },
            'skip': '404 not found',
        },
        {
            'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
            'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef',
            'info_dict': {
                'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
                'ext': 'flv',
                'title': 'Manfred Schreiber ist tot',
                'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
                'duration': 26,
            },
            'skip': '404 not found',
        },
        {
            'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
            'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
            'info_dict': {
                'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
                'ext': 'aac',
                'title': 'Kurzweilig und sehr bewegend',
                'description': 'md5:0351996e3283d64adeb38ede91fac54e',
                'duration': 296,
            },
            'skip': '404 not found',
        },
        {
            'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
            'md5': 'dbab0aef2e047060ea7a21fc1ce1078a',
            'info_dict': {
                'id': '6ba73750-d405-45d3-861d-1ce8c524e059',
                'ext': 'mp4',
                'title': 'Umweltbewusster Häuslebauer',
                'description': 'md5:d52dae9792d00226348c1dbb13c9bae2',
                'duration': 116,
            }
        },
        {
            'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html',
            'md5': '23bca295f1650d698f94fc570977dae3',
            'info_dict': {
                'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',
                'ext': 'mp4',
                'title': 'Folge 1 - Metaphysik',
                'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
                'duration': 893,
                'uploader': 'Eva Maria Steimle',
                'upload_date': '20170208',
            }
        },
    ]

    def _real_extract(self, url):
        """Return the info dict of the first media entry of the page's player XML.

        The page registers its player with a ``dataURL`` pointing to an XML
        document; every ``video``/``audio`` element in it is parsed, and a
        warning is emitted when more than one is found.

        Raises ExtractorError when the XML contains no media element.
        """
        base_url, display_id = re.search(self._VALID_URL, url).groups()
        page = self._download_webpage(url, display_id)
        xml_url = self._search_regex(
            r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
        xml = self._download_xml(base_url + xml_url, display_id)

        medias = []

        for xml_media in xml.findall('video') + xml.findall('audio'):
            media_id = xml_media.get('externalId')
            media = {
                'id': media_id,
                'title': xpath_text(xml_media, 'title', 'title', True),
                'duration': parse_duration(xpath_text(xml_media, 'duration')),
                'formats': self._extract_formats(xpath_element(
                    xml_media, 'assets'), media_id),
                'thumbnails': self._extract_thumbnails(xpath_element(
                    xml_media, 'teaserImage/variants'), base_url),
                'description': xpath_text(xml_media, 'desc'),
                'webpage_url': xpath_text(xml_media, 'permalink'),
                'uploader': xpath_text(xml_media, 'author'),
            }
            broadcast_date = xpath_text(xml_media, 'broadcastDate')
            if broadcast_date:
                # Dot-separated date parts are reversed and joined to form upload_date.
                media['upload_date'] = ''.join(reversed(broadcast_date.split('.')))
            medias.append(media)

        if len(medias) > 1:
            self._downloader.report_warning(
                'found multiple medias; please '
                'report this with the video URL to http://yt-dl.org/bug')
        if not medias:
            raise ExtractorError('No media entries found')
        return medias[0]

    def _extract_formats(self, assets, media_id):
        """Build the sorted format list from an ``assets`` XML element.

        *assets* may be None when the media entry has no ``assets`` child;
        that is treated as an empty asset list. Assets whose type starts with
        ``HDS``/``HLS`` are expanded through their manifests; anything else
        yields a direct HTTP format and/or an RTMP format.
        """
        formats = []
        asset_nodes = assets.findall('asset') if assets is not None else []
        for asset in asset_nodes:
            format_url = xpath_text(asset, ['downloadUrl', 'url'])
            asset_type = asset.get('type')
            # The type attribute may be absent; keep prefix tests safe.
            type_name = asset_type or ''
            if type_name.startswith('HDS'):
                if not format_url:
                    continue
                formats.extend(self._extract_f4m_formats(
                    format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False))
            elif type_name.startswith('HLS'):
                if not format_url:
                    continue
                # HLS formats get their own id prefix so that they do not
                # collide with the HDS ones above.
                formats.extend(self._extract_m3u8_formats(
                    format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
            else:
                format_info = {
                    'ext': xpath_text(asset, 'mediaType'),
                    'width': int_or_none(xpath_text(asset, 'frameWidth')),
                    'height': int_or_none(xpath_text(asset, 'frameHeight')),
                    'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')),
                    'abr': int_or_none(xpath_text(asset, 'bitrateAudio')),
                    'vcodec': xpath_text(asset, 'codecVideo'),
                    'acodec': xpath_text(asset, 'codecAudio'),
                    'container': xpath_text(asset, 'mediaType'),
                    'filesize': int_or_none(xpath_text(asset, 'size')),
                }
                format_url = self._proto_relative_url(format_url)
                if format_url:
                    http_format_info = format_info.copy()
                    http_format_info.update({
                        'url': format_url,
                        'format_id': 'http-%s' % asset_type,
                    })
                    formats.append(http_format_info)
                server_prefix = xpath_text(asset, 'serverPrefix')
                if server_prefix:
                    rtmp_format_info = format_info.copy()
                    rtmp_format_info.update({
                        'url': server_prefix,
                        'play_path': xpath_text(asset, 'fileName'),
                        'format_id': 'rtmp-%s' % asset_type,
                    })
                    formats.append(rtmp_format_info)
        self._sort_formats(formats)
        return formats

    def _extract_thumbnails(self, variants, base_url):
        """Return thumbnails of a ``variants`` element, largest area first.

        *variants* may be None when the media entry has no teaser image.
        Variants without a ``url`` child are ignored; a missing width or
        height counts as 0 for ordering purposes.
        """
        if variants is None:
            return []
        thumbnails = []
        for variant in variants.findall('variant'):
            thumb_path = xpath_text(variant, 'url')
            if not thumb_path:
                continue
            thumbnails.append({
                'url': base_url + thumb_path,
                'width': int_or_none(xpath_text(variant, 'width')),
                'height': int_or_none(xpath_text(variant, 'height')),
            })
        thumbnails.sort(
            key=lambda t: (t['width'] or 0) * (t['height'] or 0),
            reverse=True)
        return thumbnails
def _real_extract(self, url):
    """Query the GraphQL endpoint for the clip in *url* and return its info dict.

    Formats come from the clip's video files (HLS manifests are expanded,
    everything else becomes a direct HTTP format), caption files are listed
    as German subtitles, and all teaser image files become thumbnails.
    """
    clip_id = self._match_id(url)

    graphql_query = """{
  viewer {
    clip(id: "%s") {
      title
      description
      duration
      createdAt
      ageRestriction
      videoFiles {
        edges {
          node {
            publicLocation
            fileSize
            videoProfile {
              width
              height
              bitrate
              encoding
            }
          }
        }
      }
      captionFiles {
        edges {
          node {
            publicLocation
          }
        }
      }
      teaserImages {
        edges {
          node {
            imageFiles {
              edges {
                node {
                  publicLocation
                  width
                  height
                }
              }
            }
          }
        }
      }
    }
  }
}""" % clip_id

    response = self._download_json(
        'https://proxy-base.master.mango.express/graphql',
        clip_id, data=json.dumps({"query": graphql_query}).encode(),
        headers={
            'Content-Type': 'application/json',
        })
    clip = response['data']['viewer']['clip']
    title = clip['title']

    def iter_nodes(parent, field):
        # Walk the edges -> node nesting of one connection field.
        for edge in parent.get(field, {}).get('edges', []):
            yield edge.get('node', {})

    formats = []
    for video_file in iter_nodes(clip, 'videoFiles'):
        file_url = video_file.get('publicLocation')
        if not file_url:
            continue
        if determine_ext(file_url) == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                file_url, clip_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False))
            continue
        profile = video_file.get('videoProfile', {})
        tbr = int_or_none(profile.get('bitrate'))
        formats.append({
            'format_id': 'http-%d' % tbr if tbr else 'http',
            'url': file_url,
            'width': int_or_none(profile.get('width')),
            'height': int_or_none(profile.get('height')),
            'tbr': tbr,
            'filesize': int_or_none(video_file.get('fileSize')),
        })
    self._sort_formats(formats)

    subtitles = {}
    for caption in iter_nodes(clip, 'captionFiles'):
        caption_url = caption.get('publicLocation')
        if caption_url:
            subtitles.setdefault('de', []).append({
                'url': caption_url,
            })

    thumbnails = []
    for teaser in iter_nodes(clip, 'teaserImages'):
        for image in iter_nodes(teaser, 'imageFiles'):
            image_url = image.get('publicLocation')
            if image_url:
                thumbnails.append({
                    'url': image_url,
                    'width': int_or_none(image.get('width')),
                    'height': int_or_none(image.get('height')),
                })

    return {
        'id': clip_id,
        'title': title,
        'description': clip.get('description'),
        'duration': int_or_none(clip.get('duration')),
        'timestamp': parse_iso8601(clip.get('createdAt')),
        'age_limit': int_or_none(clip.get('ageRestriction')),
        'formats': formats,
        'subtitles': subtitles,
        'thumbnails': thumbnails,
    }
def _real_extract(self, url):
    """Resolve a bravotv.com page to a url_transparent ThePlatform result.

    The page's Drupal settings JSON decides the path: with an ``ls_tve``
    entry the release is taken from the embedded pdk-player (or from the
    settings themselves) and, for ``auth`` entitlement, an Adobe Pass token
    is added to the query; otherwise the default clip of ``ls_playlist``
    supplies both the release and the basic metadata.
    """
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)
    settings_json = self._search_regex(
        r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>',
        webpage, 'drupal settings')
    settings = self._parse_json(settings_json, display_id)

    info = {}
    query = {'mbr': 'true'}

    tve = settings.get('ls_tve')
    if not tve:
        playlist = settings['ls_playlist']
        account_pid = playlist['account_pid']
        clip = playlist['video_metadata'][playlist['default_clip']]
        release_pid = clip.get('release_pid')
        if release_pid:
            tp_path = release_pid
        else:
            # No release pid: address the media by its guid instead.
            release_pid = clip['guid']
            tp_path = 'media/guid/2140479951/' + release_pid
        info['title'] = clip['title']
        info['description'] = clip.get('description')
        info['season_number'] = int_or_none(clip.get('season_num'))
        info['episode_number'] = int_or_none(clip.get('episode_num'))
        query['switch'] = 'progressive'
    else:
        query['manifest'] = 'm3u'
        player_match = re.search(
            r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)',
            webpage)
        if player_match is None:
            account_pid = 'HNK2IC'
            release_pid = tve['release_pid']
            tp_path = release_pid
        else:
            account_pid = player_match.group(1)
            tp_path = player_match.group(2)
            release_pid = tp_path.strip('/').split('/')[-1]
        if tve.get('entitlement') == 'auth':
            adobe_pass = settings.get('tve_adobe_auth', {})
            resource = self._get_mvpd_resource(
                adobe_pass.get('adobePassResourceId', 'bravo'),
                tve['title'], release_pid, tve.get('rating'))
            query['auth'] = self._extract_mvpd_auth(
                url, release_pid,
                adobe_pass.get('adobePassRequestorId', 'bravo'), resource)

    platform_url = update_url_query(
        'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path),
        query)
    info['_type'] = 'url_transparent'
    info['id'] = release_pid
    info['url'] = smuggle_url(platform_url, {'force_smil_url': True})
    info['ie_key'] = 'ThePlatform'
    return info
class BrightcoveLegacyIE(InfoExtractor):
    """Extractor for legacy (Flash-era) Brightcove player URLs.

    It does not download anything itself: a legacy URL is translated into
    the corresponding players.brightcove.net URL and handed over to
    BrightcoveNewIE. The class methods are used to discover legacy embeds
    in arbitrary webpages.
    """

    IE_NAME = 'brightcove:legacy'
    _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'

    _TESTS = [
        {
            # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
            'md5': '5423e113865d26e40624dce2e4b45d95',
            'note': 'Test Brightcove downloads and detection in GenericIE',
            'info_dict': {
                'id': '2371591881001',
                'ext': 'mp4',
                'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
                'uploader': '8TV',
                'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
                'timestamp': 1368213670,
                'upload_date': '20130510',
                'uploader_id': '1589608506001',
            },
            'skip': 'The player has been deactivated by the content owner',
        },
        {
            # From http://medianetwork.oracle.com/video/player/1785452137001
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
            'info_dict': {
                'id': '1785452137001',
                'ext': 'flv',
                'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
                'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
                'uploader': 'Oracle',
                'timestamp': 1344975024,
                'upload_date': '20120814',
                'uploader_id': '1460825906',
            },
            'skip': 'video not playable',
        },
        {
            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
            'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
            'info_dict': {
                'id': '2750934548001',
                'ext': 'mp4',
                'title': 'This Bracelet Acts as a Personal Thermostat',
                'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
                # 'uploader': 'Mashable',
                'timestamp': 1382041798,
                'upload_date': '20131017',
                'uploader_id': '1130468786001',
            },
        },
        {
            # test that the default referer works
            # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
            'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
            'info_dict': {
                'id': '2878862109001',
                'ext': 'mp4',
                'title': 'Lost in Motion II',
                'description': 'md5:363109c02998fee92ec02211bd8000df',
                'uploader': 'National Ballet of Canada',
            },
            'skip': 'Video gone',
        },
        {
            # test flv videos served by akamaihd.net
            # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
            # The md5 checksum changes on each download
            'info_dict': {
                'id': '3750436379001',
                'ext': 'flv',
                'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
                'uploader': 'RBTV Old (do not use)',
                'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
                'timestamp': 1409122195,
                'upload_date': '20140827',
                'uploader_id': '710858724001',
            },
            'skip': 'Video gone',
        },
        {
            # playlist with 'videoList'
            # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
            'info_dict': {
                'title': 'Sealife',
                'id': '3550319591001',
            },
            'playlist_mincount': 7,
            'skip': 'Unsupported URL',
        },
        {
            # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965)
            'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
            'info_dict': {
                'id': '1522758701001',
                'title': 'Lesson 08',
            },
            'playlist_mincount': 10,
            'skip': 'Unsupported URL',
        },
        {
            # playerID inferred from bcpid
            # from http://www.un.org/chinese/News/story.asp?NewsID=27724
            'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
            'only_matching': True,  # Tested in GenericIE
        }
    ]

    @classmethod
    def _build_brighcove_url(cls, object_str):
        """
        Build a Brightcove url from a xml string containing
        <object class="BrightcoveExperience">{params}</object>

        Returns None when the markup cannot be parsed as XML or when the
        video id it carries is neither numeric, a UUID nor a ``ref:`` id.
        Raises ExtractorError when no player ID can be found.
        """

        # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553
        object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>',
                            lambda m: m.group(1) + '/>', object_str)
        # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608
        object_str = object_str.replace('<--', '<!--')
        # remove namespace to simplify extraction
        object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
        object_str = fix_xml_ampersands(object_str)

        try:
            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
        except compat_xml_parse_error:
            return

        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
        if fv_el is not None:
            flashvars = dict(
                (k, v[0])
                for k, v in compat_parse_qs(fv_el.attrib['value']).items())
        else:
            flashvars = {}

        data_url = object_doc.attrib.get('data', '')
        data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)

        def find_param(name):
            # Lookup order: flashVars, then <param> children, then the
            # query string of the <object data="..."> URL (list values).
            if name in flashvars:
                return flashvars[name]
            node = find_xpath_attr(object_doc, './param', 'name', name)
            if node is not None:
                return node.attrib['value']
            return data_url_params.get(name)

        params = {}

        playerID = find_param('playerID') or find_param('playerId')
        if playerID is None:
            raise ExtractorError('Cannot find player ID')
        params['playerID'] = playerID

        playerKey = find_param('playerKey')
        # Not all pages define this value
        if playerKey is not None:
            params['playerKey'] = playerKey
        # These fields hold the id of the video
        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
        if videoPlayer is not None:
            if isinstance(videoPlayer, list):
                videoPlayer = videoPlayer[0]
            videoPlayer = videoPlayer.strip()
            # UUID is also possible for videoPlayer (e.g.
            # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
            # or http://www8.hp.com/cn/zh/home.html)
            if not (re.match(
                    r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
                    videoPlayer) or videoPlayer.startswith('ref:')):
                return None
            params['@videoPlayer'] = videoPlayer
        linkBase = find_param('linkBaseURL')
        if linkBase is not None:
            params['linkBaseURL'] = linkBase
        return cls._make_brightcove_url(params)

    @classmethod
    def _build_brighcove_url_from_js(cls, object_js):
        """Build a Brightcove url from a ``customBC.createVideo(...)`` call.

        Returns None when the call does not have the expected arguments.
        """
        # The layout of JS is as follows:
        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
        #   // build Brightcove <object /> XML
        # }
        m = re.search(
            r'''(?x)customBC\.createVideo\(
                .*?                                                  # skipping width and height
                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
                                                                     # in length, however it's appended to itself
                                                                     # in places, so truncate
                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
            ''', object_js)
        if m:
            return cls._make_brightcove_url(m.groupdict())

    @classmethod
    def _make_brightcove_url(cls, params):
        """Return the htmlFederated viewer URL carrying *params* as query."""
        return update_url_query(
            'http://c.brightcove.com/services/viewer/htmlFederated', params)

    @classmethod
    def _extract_brightcove_url(cls, webpage):
        """Try to extract the brightcove url from the webpage, returns None
        if it can't be found
        """
        urls = cls._extract_brightcove_urls(webpage)
        return urls[0] if urls else None

    @classmethod
    def _extract_brightcove_urls(cls, webpage):
        """Return a list of all Brightcove URLs from the webpage """

        url_m = re.search(
            r'''(?x)
                <meta\s+
                    (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+
                    content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove\.com/(?:(?!\2).)+)\2
            ''', webpage)
        if url_m:
            url = unescapeHTML(url_m.group('url'))
            # Some sites don't add it, we can't download with this url, for example:
            # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
            if 'playerKey' in url or 'videoId' in url or 'idVideo' in url:
                return [url]

        matches = re.findall(
            r'''(?sx)<object
            (?:
                [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
            ).+?>\s*</object>''',
            webpage)
        if matches:
            return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))

        matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
        if matches:
            return list(filter(None, [
                cls._build_brighcove_url_from_js(custom_bc)
                for custom_bc in matches]))
        # The URL must stop at the quote that opened the src attribute;
        # a plain greedy `.+` would swallow the attributes that follow.
        return [src for _, src in re.findall(
            r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?:(?!\1).)+)\1', webpage)]

    @staticmethod
    def _publisher_id_from_player_key(player_key):
        """Decode the numeric publisher id embedded in a player key.

        The second comma-separated field of the key is url-safe base64
        (with ``~`` standing in for ``=``) of a big-endian unsigned 64-bit
        integer. Returns None when the key does not have that shape.
        """
        fields = player_key.split(',')
        if len(fields) < 2:
            return None
        enc_pub_id = fields[1].replace('~', '=')
        try:
            return struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
        except (TypeError, ValueError, struct.error):
            return None

    def _real_extract(self, url):
        """Translate a legacy player URL into a BrightcoveNew url result.

        The publisher (account) id is taken from the ``publisherId`` query
        parameter when numeric, otherwise decoded from the player key, which
        is read from the query or, failing that, from the player page.

        Raises UnsupportedError when no video id or publisher id is found.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        # Change the 'videoId' and others field to '@videoPlayer'
        url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
        # Change bckey (used by bcove.me urls) to playerKey
        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
        mobj = re.match(self._VALID_URL, url)
        query_str = mobj.group('query')
        query = compat_urlparse.parse_qs(query_str)

        videoPlayer = query.get('@videoPlayer')
        if videoPlayer:
            # We set the original url as the default 'Referer' header
            referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url)
            video_id = videoPlayer[0]
            if 'playerID' not in query:
                mobj = re.search(r'/bcpid(\d+)', url)
                if mobj is not None:
                    query['playerID'] = [mobj.group(1)]

            # parse_qs yields lists; only a numeric first value is usable.
            publisher_id = None
            publisher_ids = query.get('publisherId')
            if publisher_ids and publisher_ids[0].isdigit():
                publisher_id = publisher_ids[0]

            if not publisher_id:
                player_key = None
                player_keys = query.get('playerKey')
                if player_keys and ',' in player_keys[0]:
                    player_key = player_keys[0]
                else:
                    player_id = query.get('playerID')
                    if player_id and player_id[0].isdigit():
                        headers = {}
                        if referer:
                            headers['Referer'] = referer
                        player_page = self._download_webpage(
                            'http://link.brightcove.com/services/player/bcpid' + player_id[0],
                            video_id, headers=headers, fatal=False)
                        if player_page:
                            player_key = self._search_regex(
                                r'<param\s+name="playerKey"\s+value="([\w~,-]+)"',
                                player_page, 'player key', fatal=False)
                if player_key:
                    publisher_id = self._publisher_id_from_player_key(player_key)
            if publisher_id:
                brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id)
                if referer:
                    brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
                return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
        # TODO: figure out if it's possible to extract playlistId from playerKey
        # elif 'playerKey' in query:
        #     player_key = query['playerKey']
        #     return self._get_playlist_info(player_key[0])
        raise UnsupportedError(url)
@staticmethod
def _extract_url(ie, webpage):
    """Return the first Brightcove player URL found in *webpage*, or None.

    *ie* is passed through to ``_extract_urls``.
    """
    found = BrightcoveNewIE._extract_urls(ie, webpage)
    if not found:
        return None
    return found[0]
+ ''', webpage): + attrs = extract_attributes(video) + + # According to examples from [4] it's unclear whether video id + # may be optional and what to do when it is + video_id = attrs.get('data-video-id') + if not video_id: + continue + + account_id = account_id or attrs.get('data-account') + if not account_id: + continue + + player_id = player_id or attrs.get('data-player') or 'default' + embed = embed or attrs.get('data-embed') or 'default' + + bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( + account_id, player_id, embed, video_id) + + # Some brightcove videos may be embedded with video tag only and + # without script tag or any mentioning of brightcove at all. Such + # embeds are considered ambiguous since they are matched based only + # on data-video-id and data-account attributes and in the wild may + # not be brightcove embeds at all. Let's check reconstructed + # brightcove URLs in case of such embeds and only process valid + # ones. By this we ensure there is indeed a brightcove embed. 
def _parse_brightcove_metadata(self, json_data, video_id, headers=None):
    """Turn a Playback API video object into an info dict.

    json_data -- decoded JSON of one video (must contain ``name``)
    video_id  -- id used for the result and for nested downloads
    headers   -- optional HTTP headers attached to every format

    Sources that are ISM, have a ``WVM`` container or carry ``key_systems``
    are skipped; HLS and DASH sources are expanded through their manifests;
    all other sources become plain HTTP or RTMP formats. When no format
    is found, ``custom_fields.s3sourceurl`` is used as a fallback.

    Raises ExtractorError (expected) with the first reported API error
    when there is no format at all and the response lists ``errors``.
    """
    headers = headers or {}
    title = json_data['name'].strip()

    formats = []
    for source in json_data.get('sources') or []:
        container = source.get('container')
        ext = mimetype2ext(source.get('type'))
        src = source.get('src')
        # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
        if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
            continue
        elif ext == 'm3u8' or container == 'M2TS':
            if not src:
                continue
            formats.extend(self._extract_m3u8_formats(
                src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
        elif ext == 'mpd':
            if not src:
                continue
            formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
        else:
            streaming_src = source.get('streaming_src')
            stream_name, app_name = source.get('stream_name'), source.get('app_name')
            if not src and not streaming_src and (not stream_name or not app_name):
                continue
            tbr = float_or_none(source.get('avg_bitrate'), 1000)
            height = int_or_none(source.get('height'))
            width = int_or_none(source.get('width'))
            f = {
                'tbr': tbr,
                'filesize': int_or_none(source.get('size')),
                'container': container,
                # Both the mimetype and the container may be missing.
                'ext': ext or (container.lower() if container else None),
            }
            if width == 0 and height == 0:
                # A 0x0 frame size is treated as an audio-only source.
                f.update({
                    'vcodec': 'none',
                })
            else:
                f.update({
                    'width': width,
                    'height': height,
                    'vcodec': source.get('codec'),
                })

            def build_format_id(kind):
                format_id = kind
                if tbr:
                    format_id += '-%dk' % int(tbr)
                if height:
                    format_id += '-%dp' % height
                return format_id

            if src or streaming_src:
                f.update({
                    'url': src or streaming_src,
                    'format_id': build_format_id('http' if src else 'http-streaming'),
                    'source_preference': 0 if src else -1,
                })
            else:
                f.update({
                    'url': app_name,
                    'play_path': stream_name,
                    'format_id': build_format_id('rtmp'),
                })
            formats.append(f)
    if not formats:
        # for sonyliv.com DRM protected videos
        s3_source_url = (json_data.get('custom_fields') or {}).get('s3sourceurl')
        if s3_source_url:
            formats.append({
                'url': s3_source_url,
                'format_id': 'source',
            })

    errors = json_data.get('errors')
    if not formats and errors:
        error = errors[0]
        raise ExtractorError(
            error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)

    self._sort_formats(formats)

    for f in formats:
        f.setdefault('http_headers', {}).update(headers)

    subtitles = {}
    for text_track in json_data.get('text_tracks') or []:
        if text_track.get('kind') != 'captions':
            continue
        text_track_url = url_or_none(text_track.get('src'))
        if not text_track_url:
            continue
        lang = (str_or_none(text_track.get('srclang'))
                or str_or_none(text_track.get('label')) or 'en').lower()
        subtitles.setdefault(lang, []).append({
            'url': text_track_url,
        })

    # A non-positive duration marks the video as a live stream.
    is_live = False
    duration = float_or_none(json_data.get('duration'), 1000)
    if duration is not None and duration <= 0:
        is_live = True

    return {
        'id': video_id,
        'title': self._live_title(title) if is_live else title,
        'description': clean_html(json_data.get('description')),
        'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
        'duration': duration,
        'timestamp': parse_iso8601(json_data.get('published_at')),
        'uploader_id': json_data.get('account_id'),
        'formats': formats,
        'subtitles': subtitles,
        'tags': json_data.get('tags', []),
        'is_live': is_live,
    }
self._downloader.cache.load('brightcove', policy_key_id) + policy_key_extracted = False + store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) + + def extract_policy_key(): + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', + webpage, 'policy key', group='pk') + + store_pk(policy_key) + return policy_key + + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) + headers = {} + referrer = smuggled_data.get('referrer') + if referrer: + headers.update({ + 'Referer': referrer, + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), + }) + + for _ in range(2): + if not policy_key: + policy_key = extract_policy_key() + policy_key_extracted = True + headers['Accept'] = 'application/json;pk=%s' % policy_key + try: + json_data = self._download_json(api_url, video_id, headers=headers) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted: + policy_key = None + store_pk(None) + continue + raise ExtractorError(message, expected=True) + raise + + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = 
json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + + if content_type == 'playlist': + return self.playlist_result( + [self._parse_brightcove_metadata(vid, vid.get('id'), headers) + for vid in json_data.get('videos', []) if vid.get('id')], + json_data.get('id'), json_data.get('name'), + json_data.get('description')) + + return self._parse_brightcove_metadata( + json_data, video_id, headers=headers) diff --git a/youtube_dlc/extractor/businessinsider.py b/youtube_dlc/extractor/businessinsider.py new file mode 100644 index 0000000..73a57b1 --- /dev/null +++ b/youtube_dlc/extractor/businessinsider.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .jwplatform import JWPlatformIE + + +class BusinessInsiderIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', + 'md5': 'ffed3e1e12a6f950aa2f7d83851b497a', + 'info_dict': { + 'id': 'cjGDb0X9', + 'ext': 'mp4', + 'title': "Bananas give you more radiation exposure than living next to a nuclear power plant", + 'description': 'md5:0175a3baf200dd8fa658f94cade841b3', + 'upload_date': '20160611', + 'timestamp': 1465675620, + }, + }, { + 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', + 'md5': '43f438dbc6da0b89f5ac42f68529d84a', + 'info_dict': { + 'id': '5zJwd4FK', + 'ext': 'mp4', + 'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort', + 'description': 'md5:2af8975825d38a4fed24717bbe51db49', + 'upload_date': '20170705', + 
'timestamp': 1499270528, + }, + }, { + 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex( + (r'data-media-id=["\']([a-zA-Z0-9]{8})', + r'id=["\']jwplayer_([a-zA-Z0-9]{8})', + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})', + r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'), + webpage, 'jwplatform id') + return self.url_result( + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=video_id) diff --git a/youtube_dlc/extractor/buzzfeed.py b/youtube_dlc/extractor/buzzfeed.py new file mode 100644 index 0000000..ec41109 --- /dev/null +++ b/youtube_dlc/extractor/buzzfeed.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from .facebook import FacebookIE + + +class BuzzFeedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)' + _TESTS = [{ + 'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia', + 'info_dict': { + 'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss', + 'title': 'This Angry Ram Destroys A Punching Bag Like A Boss', + 'description': 'Rambro!', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'aVCR29aE_OQ', + 'ext': 'mp4', + 'title': 'Angry Ram destroys a punching bag..', + 'description': 'md5:c59533190ef23fd4458a5e8c8c872345', + 'upload_date': '20141024', + 'uploader_id': 'Buddhanz1', + 'uploader': 'Angry Ram', + } + }] + }, { + 'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia', + 'params': { + 'skip_download': True, # Got enough YouTube download tests + }, + 'info_dict': { + 'id': 'look-at-this-cute-dog-omg', + 'description': 're:Munchkin the Teddy Bear is back ?!', + 'title': 'You Need To 
Stop What You\'re Doing And Watching This Dog Walk On A Treadmill', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'mVmBL8B-In0', + 'ext': 'mp4', + 'title': 're:Munchkin the Teddy Bear gets her exercise', + 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8', + 'upload_date': '20141124', + 'uploader_id': 'CindysMunchkin', + 'uploader': 're:^Munchkin the', + }, + }] + }, { + 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', + 'info_dict': { + 'id': 'the-most-adorable-crash-landing-ever', + 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', + 'description': 'This gosling knows how to stick a landing.', + }, + 'playlist': [{ + 'md5': '763ca415512f91ca62e4621086900a23', + 'info_dict': { + 'id': '971793786185728', + 'ext': 'mp4', + 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', + 'uploader': 'Calgary Outdoor Centre-University of Calgary', + }, + }], + 'add_ie': ['Facebook'], + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + all_buckets = re.findall( + r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'', + webpage) + + entries = [] + for bd_json in all_buckets: + bd = json.loads(bd_json) + video = bd.get('video') or bd.get('progload_video') + if not video: + continue + entries.append(self.url_result(video['url'])) + + facebook_urls = FacebookIE._extract_urls(webpage) + entries.extend([ + self.url_result(facebook_url) + for facebook_url in facebook_urls]) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'entries': entries, + } diff --git a/youtube_dlc/extractor/byutv.py b/youtube_dlc/extractor/byutv.py new file mode 100644 index 0000000..0b11bf1 --- /dev/null +++ b/youtube_dlc/extractor/byutv.py @@ -0,0 +1,117 @@ +from __future__ import 
unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + merge_dicts, + parse_duration, + url_or_none, +) + + +class BYUtvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' + _TESTS = [{ + # ooyalaVOD + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', + 'info_dict': { + 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', + 'display_id': 'studio-c-season-5-episode-5', + 'ext': 'mp4', + 'title': 'Season 5 Episode 5', + 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', + 'thumbnail': r're:^https?://.*', + 'duration': 1486.486, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + # dvr + 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', + 'info_dict': { + 'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451', + 'display_id': 'byu-softball-pacific-vs-byu-41219---game-2', + 'ext': 'mp4', + 'title': 'Pacific vs. 
BYU (4/12/19)', + 'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3', + 'duration': 11645, + }, + 'params': { + 'skip_download': True + }, + }, { + 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', + 'only_matching': True, + }, { + 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + video = self._download_json( + 'https://api.byutv.org/api3/catalog/getvideosforcontent', + display_id, query={ + 'contentid': video_id, + 'channel': 'byutv', + 'x-byutv-context': 'web$US', + }, headers={ + 'x-byutv-context': 'web$US', + 'x-byutv-platformkey': 'xsaaw9c7y5', + }) + + ep = video.get('ooyalaVOD') + if ep: + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % ep['providerId'], + 'id': video_id, + 'display_id': display_id, + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + } + + info = {} + formats = [] + for format_id, ep in video.items(): + if not isinstance(ep, dict): + continue + video_url = url_or_none(ep.get('videoUrl')) + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + merge_dicts(info, { + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + 'duration': parse_duration(ep.get('length')), + }) + self._sort_formats(formats) + + 
return merge_dicts(info, { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'formats': formats, + }) diff --git a/youtube_dlc/extractor/c56.py b/youtube_dlc/extractor/c56.py new file mode 100644 index 0000000..cac8fdc --- /dev/null +++ b/youtube_dlc/extractor/c56.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class C56IE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' + IE_NAME = '56.com' + _TESTS = [{ + 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', + 'md5': 'e59995ac63d0457783ea05f93f12a866', + 'info_dict': { + 'id': '93440716', + 'ext': 'flv', + 'title': '网事知多少 第32期:车怒', + 'duration': 283.813, + }, + }, { + 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', + 'md5': '', + 'info_dict': { + 'id': '82247482', + 'title': '爱的诅咒之杜鹃花开', + }, + 'playlist_count': 7, + 'add_ie': ['Sohu'], + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + text_id = mobj.group('textid') + + webpage = self._download_webpage(url, text_id) + sohu_video_info_str = self._search_regex( + r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) + if sohu_video_info_str: + sohu_video_info = self._parse_json( + sohu_video_info_str, text_id, transform_source=js_to_json) + return self.url_result(sohu_video_info['url'], 'Sohu') + + page = self._download_json( + 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') + + info = page['info'] + + formats = [ + { + 'format_id': f['type'], + 'filesize': int(f['filesize']), + 'url': f['url'] + } for f in info['rfiles'] + ] + self._sort_formats(formats) + + return { + 'id': info['vid'], + 'title': info['Subject'], + 'duration': int(info['duration']) / 1000.0, + 'formats': formats, + 'thumbnail': info.get('bimg') or info.get('img'), + } 
diff --git a/youtube_dlc/extractor/camdemy.py b/youtube_dlc/extractor/camdemy.py new file mode 100644 index 0000000..8f0c6c5 --- /dev/null +++ b/youtube_dlc/extractor/camdemy.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_urlparse, +) +from ..utils import ( + clean_html, + parse_duration, + str_to_int, + unified_strdate, +) + + +class CamdemyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' + _TESTS = [{ + # single file + 'url': 'http://www.camdemy.com/media/5181/', + 'md5': '5a5562b6a98b37873119102e052e311b', + 'info_dict': { + 'id': '5181', + 'ext': 'mp4', + 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'ss11spring', + 'duration': 1591, + 'upload_date': '20130114', + 'view_count': int, + } + }, { + # With non-empty description + # webpage returns "No permission or not login" + 'url': 'http://www.camdemy.com/media/13885', + 'md5': '4576a3bb2581f86c61044822adbd1249', + 'info_dict': { + 'id': '13885', + 'ext': 'mp4', + 'title': 'EverCam + Camdemy QuickStart', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', + 'creator': 'evercam', + 'duration': 318, + } + }, { + # External source (YouTube) + 'url': 'http://www.camdemy.com/media/14842', + 'info_dict': { + 'id': '2vsYQzNIsJo', + 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', + 'upload_date': '20130211', + 'uploader': 'Hun Kim', + 'uploader_id': 'hunkimtutorials', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + src_from = self._html_search_regex( + 
r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') + if src_from: + return self.url_result(src_from) + + oembed_obj = self._download_json( + 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + + title = oembed_obj['title'] + thumb_url = oembed_obj['thumbnail_url'] + video_folder = compat_urlparse.urljoin(thumb_url, 'video/') + file_list_doc = self._download_xml( + compat_urlparse.urljoin(video_folder, 'fileList.xml'), + video_id, 'Downloading filelist XML') + file_name = file_list_doc.find('./video/item/fileName').text + video_url = compat_urlparse.urljoin(video_folder, file_name) + + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumb_url, + 'description': description, + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), + 'upload_date': upload_date, + 'view_count': view_count, + } + + +class CamdemyFolderIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)' + _TESTS = [{ + # links with trailing slash + 'url': 'http://www.camdemy.com/folder/450', + 'info_dict': { + 'id': '450', + 'title': '信號與系統 2012 & 2011 (Signals and Systems)', + }, + 'playlist_mincount': 145 + }, { + # links without trailing slash + # and multi-page + 'url': 'http://www.camdemy.com/folder/853', + 
'info_dict': { + 'id': '853', + 'title': '科學計算 - 使用 Matlab' + }, + 'playlist_mincount': 20 + }, { + # with displayMode parameter. For testing the codes to add parameters + 'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg', + 'info_dict': { + 'id': '853', + 'title': '科學計算 - 使用 Matlab' + }, + 'playlist_mincount': 20 + }] + + def _real_extract(self, url): + folder_id = self._match_id(url) + + # Add displayMode=list so that all links are displayed in a single page + parsed_url = list(compat_urlparse.urlparse(url)) + query = dict(compat_urlparse.parse_qsl(parsed_url[4])) + query.update({'displayMode': 'list'}) + parsed_url[4] = compat_urllib_parse_urlencode(query) + final_url = compat_urlparse.urlunparse(parsed_url) + + page = self._download_webpage(final_url, folder_id) + matches = re.findall(r"href='(/media/\d+/?)'", page) + + entries = [self.url_result('http://www.camdemy.com' + media_path) + for media_path in matches] + + folder_title = self._html_search_meta('keywords', page) + + return self.playlist_result(entries, folder_id, folder_title) diff --git a/youtube_dlc/extractor/cammodels.py b/youtube_dlc/extractor/cammodels.py new file mode 100644 index 0000000..1eb81b7 --- /dev/null +++ b/youtube_dlc/extractor/cammodels.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + url_or_none, +) + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + 'age_limit': 18 + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + 
("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), + ) + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + + manifest = self._download_json( + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = url_or_none(media.get('location')) + if not media_url: + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ + 'ext': 'mp4', + # hls skips fragments, preferring rtmp + 'preference': -1, + }) + else: + continue + formats.append(f) + self._sort_formats(formats) + + return { + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, + 'age_limit': 18 + } diff --git a/youtube_dlc/extractor/camtube.py b/youtube_dlc/extractor/camtube.py new file mode 100644 index 0000000..b3be3bd --- /dev/null +++ b/youtube_dlc/extractor/camtube.py @@ -0,0 
+1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class CamTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', + 'info_dict': { + 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', + 'display_id': 'minafay-030618-1136-chaturbate-female', + 'ext': 'mp4', + 'title': 'minafay-030618-1136-chaturbate-female', + 'duration': 1274, + 'timestamp': 1528018608, + 'upload_date': '20180603', + 'age_limit': 18 + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_BASE = 'https://api.camtube.co' + + def _real_extract(self, url): + display_id = self._match_id(url) + + token = self._download_json( + '%s/rpc/session/new' % self._API_BASE, display_id, + 'Downloading session token')['token'] + + self._set_cookie('api.camtube.co', 'session', token) + + video = self._download_json( + '%s/recordings/%s' % (self._API_BASE, display_id), display_id, + headers={'Referer': url}) + + video_id = video['uuid'] + timestamp = unified_timestamp(video.get('createdAt')) + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('likeCount')) + creator = video.get('stageName') + + formats = [{ + 'url': '%s/recordings/%s/manifest.m3u8' + % (self._API_BASE, video_id), + 'format_id': 'hls', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }] + + return { + 'id': video_id, + 'display_id': display_id, + 'title': display_id, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'creator': creator, + 'formats': formats, + 'age_limit': 18 + } diff --git a/youtube_dlc/extractor/camwithher.py b/youtube_dlc/extractor/camwithher.py new file mode 100644 index 0000000..bbc5205 --- /dev/null +++ 
b/youtube_dlc/extractor/camwithher.py @@ -0,0 +1,89 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) + + +class CamWithHerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)' + + _TESTS = [{ + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + 'description': 'In the clouds teasing on periscope to my favorite song', + 'duration': 240, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MileenaK', + 'upload_date': '20160322', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flv_id = self._html_search_regex( + r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id') + + # Video URL construction algorithm is reverse-engineered from cwhplayer.swf + rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( + ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) + + title = self._html_search_regex( + r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title') + description = self._html_search_regex( + r'>Description:</span>(.+?)</div>', webpage, 'description', default=None) + + runtime = self._search_regex( + r'Runtime\s*:\s*(.+?) 
\|', webpage, 'duration', default=None) + if runtime: + runtime = re.sub(r'[\s-]', '', runtime) + duration = parse_duration(runtime) + view_count = int_or_none(self._search_regex( + r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) + + uploader = self._search_regex( + r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None) + upload_date = unified_strdate(self._search_regex( + r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) + + return { + 'id': flv_id, + 'url': rtmp_url, + 'ext': 'flv', + 'no_resume': True, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'uploader': uploader, + 'upload_date': upload_date, + 'age_limit': 18 + } diff --git a/youtube_dlc/extractor/canalc2.py b/youtube_dlc/extractor/canalc2.py new file mode 100644 index 0000000..407cc80 --- /dev/null +++ b/youtube_dlc/extractor/canalc2.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class Canalc2IE(InfoExtractor): + IE_NAME = 'canalc2.tv' + _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.canalc2.tv/video/12163', + 'md5': '060158428b650f896c542dfbb3d6487f', + 'info_dict': { + 'id': '12163', + 'ext': 'mp4', + 'title': 'Terrasses du Numérique', + 'duration': 122, + }, + }, { + 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.canalc2.tv/video/%s' % video_id, video_id) + + title = self._html_search_regex( + 
r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>', + webpage, 'title') + + formats = [] + for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage): + if video_url.startswith('rtmp://'): + rtmp = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url) + formats.append({ + 'url': rtmp.group('url'), + 'format_id': 'rtmp', + 'ext': 'flv', + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + 'page_url': url, + }) + else: + formats.append({ + 'url': video_url, + 'format_id': 'http', + }) + + if formats: + info = { + 'formats': formats, + } + else: + info = self._parse_html5_media_entries(url, webpage, url)[0] + + self._sort_formats(info['formats']) + + info.update({ + 'id': video_id, + 'title': title, + 'duration': parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)), + }) + return info diff --git a/youtube_dlc/extractor/canalplus.py b/youtube_dlc/extractor/canalplus.py new file mode 100644 index 0000000..51c11cb --- /dev/null +++ b/youtube_dlc/extractor/canalplus.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + # ExtractorError, + # HEADRequest, + int_or_none, + qualities, + unified_strdate, +) + + +class CanalplusIE(InfoExtractor): + IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' + _SITE_ID_MAP = { + 'mycanal': 'cplus', + 'piwiplus': 'teletoon', + } + + # Only works for direct mp4 URLs + _GEO_COUNTRIES = ['FR'] + + _TESTS = [{ + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', + 'info_dict': { + 'id': '1397061', + 'display_id': 'lolywood', + 'ext': 'mp4', + 'title': 'Euro 2016 : Je préfère te 
prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', + }, + }, { + # geo restricted, bypassed + 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', + 'info_dict': { + 'id': '1108190', + 'display_id': 'pid1405-le-labyrinthe-boing-super-ranger', + 'ext': 'mp4', + 'title': 'BOING SUPER RANGER - Ep : Le labyrinthe', + 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', + 'upload_date': '20140724', + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }] + + def _real_extract(self, url): + site, display_id, video_id = re.match(self._VALID_URL, url).groups() + + site_id = self._SITE_ID_MAP[site] + + info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) + video_data = self._download_json(info_url, video_id, 'Downloading video JSON') + + if isinstance(video_data, list): + video_data = [video for video in video_data if video.get('ID') == video_id][0] + media = video_data['MEDIA'] + infos = video_data['INFOS'] + + preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) + + # _, fmt_url = next(iter(media['VIDEOS'].items())) + # if '/geo' in fmt_url.lower(): + # response = self._request_webpage( + # HEADRequest(fmt_url), video_id, + # 'Checking if the video is georestricted') + # if '/blocage' in response.geturl(): + # raise ExtractorError( + # 'The video is not available in your country', + # expected=True) + + formats = [] + for format_id, format_url in media['VIDEOS'].items(): + if not format_url: + continue + if format_id == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + elif format_id == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) + else: + formats.append({ + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js + 'url': 
class CanvasIE(InfoExtractor):
    """Extract media served through VRT's mediazone asset URLs.

    The legacy mediazone endpoint is tried first; when it yields nothing the
    token-protected REST API is queried instead.
    """

    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
        'md5': '68993eda72ef62386a15ea2cf3c93107',
        'info_dict': {
            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
            'ext': 'mp4',
            'title': 'Nachtwacht: De Greystook',
            'description': 'Nachtwacht: De Greystook',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1468.04,
        },
        'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
    }, {
        'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
        'only_matching': True,
    }]
    # Upper-cased target type -> protocol argument for _extract_m3u8_formats.
    _HLS_ENTRY_PROTOCOLS_MAP = {
        'HLS': 'm3u8_native',
        'HLS_AES': 'm3u8',
    }
    _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'

    def _real_extract(self, url):
        """Return the info dict (formats, subtitles, metadata) for *url*.

        Raises:
            ExtractorError: when the REST API reports an error message
                without a title, when login is required, or when neither
                endpoint returned any JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        site_id, video_id = mobj.group('site_id'), mobj.group('id')

        # Old API endpoint, serves more formats but may fail for some videos
        data = self._download_json(
            'https://mediazone.vrt.be/api/v1/%s/assets/%s'
            % (site_id, video_id), video_id, 'Downloading asset JSON',
            'Unable to download asset JSON', fatal=False)

        # New API endpoint
        if not data:
            token = self._download_json(
                '%s/tokens' % self._REST_API_BASE, video_id,
                'Downloading token', data=b'',
                headers={'Content-Type': 'application/json'})['vrtPlayerToken']
            data = self._download_json(
                '%s/videos/%s' % (self._REST_API_BASE, video_id),
                video_id, 'Downloading video JSON', fatal=False, query={
                    'vrtPlayerToken': token,
                    'client': '%s@PROD' % site_id,
                }, expected_status=400)
            if not data:
                # This download is non-fatal too, so it can come back falsy
                # just like the first one; fail with a clear extractor error
                # instead of crashing on ``data.get`` below.
                raise ExtractorError('Unable to download video JSON')
            message = data.get('message')
            if message and not data.get('title'):
                if data.get('code') == 'AUTHENTICATION_REQUIRED':
                    self.raise_login_required(message)
                raise ExtractorError(message, expected=True)

        title = data['title']
        description = data.get('description')

        formats = []
        for target in data['targetUrls']:
            format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
            if not format_url or not format_type:
                continue
            format_type = format_type.upper()
            if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
                    m3u8_id=format_type, fatal=False))
            elif format_type == 'HDS':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, f4m_id=format_type, fatal=False))
            elif format_type == 'MPEG_DASH':
                formats.extend(self._extract_mpd_formats(
                    format_url, video_id, mpd_id=format_type, fatal=False))
            elif format_type == 'HSS':
                formats.extend(self._extract_ism_formats(
                    format_url, video_id, ism_id='mss', fatal=False))
            else:
                # Unknown target type: expose the URL as a plain format.
                formats.append({
                    'format_id': format_type,
                    'url': format_url,
                })
        self._sort_formats(formats)

        subtitles = {}
        subtitle_urls = data.get('subtitleUrls')
        if isinstance(subtitle_urls, list):
            for subtitle in subtitle_urls:
                subtitle_url = subtitle.get('url')
                # Only entries typed 'CLOSED' are kept, filed under 'nl'.
                if subtitle_url and subtitle.get('type') == 'CLOSED':
                    subtitles.setdefault('nl', []).append({'url': subtitle_url})

        return {
            'id': video_id,
            'display_id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            # The divisor 1000 scales the API value (presumably ms -> s).
            'duration': float_or_none(data.get('duration'), 1000),
            'thumbnail': data.get('posterImageUrl'),
            'subtitles': subtitles,
        }
class VrtNUIE(GigyaBaseIE):
    """Extractor for vrt.be/vrtnu pages; delegates playback to CanvasIE."""

    IE_DESC = 'VrtNU.be'
    _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        # Available via old API endpoint
        'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
        'info_dict': {
            'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
            'ext': 'mp4',
            'title': 'De zwarte weduwe',
            'description': 'md5:db1227b0f318c849ba5eab1fef895ee4',
            'duration': 1457.04,
            'thumbnail': r're:^https?://.*\.jpg$',
            'season': 'Season 1',
            'season_number': 1,
            'episode_number': 1,
        },
        'skip': 'This video is only available for registered users',
        'params': {
            'username': '<snip>',
            'password': '<snip>',
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # Only available via new API endpoint
        'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
        'info_dict': {
            'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
            'ext': 'mp4',
            'title': 'Aflevering 5',
            'description': 'Wie valt door de mand tijdens een missie?',
            'duration': 2967.06,
            'season': 'Season 1',
            'season_number': 1,
            'episode_number': 5,
        },
        'skip': 'This video is only available for registered users',
        'params': {
            'username': '<snip>',
            'password': '<snip>',
        },
        'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
    }]
    _NETRC_MACHINE = 'vrtnu'
    _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
    _CONTEXT_ID = 'R3595707040'

    def _real_initialize(self):
        """Log in (when credentials are configured) before extraction."""
        self._login()

    def _login(self):
        """Authenticate through Gigya, then ask token.vrt.be to set cookies.

        Does nothing when no username is configured. A 401 from the token
        endpoint is retried, for at most three attempts in total; any other
        extractor error is re-raised.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        auth_info = self._gigya_login({
            'APIKey': self._APIKEY,
            'targetEnv': 'jssdk',
            'loginID': username,
            'password': password,
            'authMode': 'cookie',
        })

        token_headers = {
            'Content-Type': 'application/json',
            'Referer': 'https://www.vrt.be/vrtnu/',
        }
        token_payload = json.dumps({
            'uid': auth_info['UID'],
            'uidsig': auth_info['UIDSignature'],
            'ts': auth_info['signatureTimestamp'],
            'email': auth_info['profile']['email'],
        }).encode('utf-8')

        # Sometimes authentication fails for no good reason, retry
        for _attempt in range(3):
            try:
                # When requesting a token, no actual token is returned, but the
                # necessary cookies are set.
                self._request_webpage(
                    'https://token.vrt.be',
                    None, note='Requesting a token', errnote='Could not get a token',
                    headers=token_headers,
                    data=token_payload)
            except ExtractorError as e:
                unauthorized = isinstance(e.cause, compat_HTTPError) and e.cause.code == 401
                if not unauthorized:
                    raise e
                self.report_warning('Authentication failed')
                self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
            else:
                break

    def _real_extract(self, url):
        """Scrape page metadata and hand the video over to CanvasIE.

        Returns either a plain URL result (when the secure-video JSON carries
        a redirect ``url``) or a ``url_transparent`` dict merged on top of the
        page's JSON-LD info.
        """
        display_id = self._match_id(url)

        webpage, urlh = self._download_webpage_handle(url, display_id)

        json_ld_info = self._search_json_ld(webpage, display_id, default={})

        # title is optional here since it may be extracted by extractor
        # that is delegated from here
        raw_title = self._html_search_regex(
            r'(?ms)<h1 class="content__heading">(.+?)</h1>',
            webpage, 'title', default=None)
        title = strip_or_none(raw_title)

        description = self._html_search_regex(
            r'(?ms)<div class="content__description">(.+?)</div>',
            webpage, 'description', default=None)

        season_patterns = [
            r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
                    <span>seizoen\ (.+?)</span>\s*
                </div>''',
            r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>',
        ]
        season = self._html_search_regex(
            season_patterns, webpage, 'season', default=None)
        season_number = int_or_none(season)

        raw_episode = self._html_search_regex(
            r'''(?xms)<div\ class="content__episode">\s*
                    <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
                </div>''',
            webpage, 'episode_number', default=None)
        episode_number = int_or_none(raw_episode)

        raw_release = self._html_search_regex(
            r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
            webpage, 'release_date', default=None)
        release_date = parse_iso8601(raw_release)

        # If there's a ? or a # in the URL, remove them and everything after
        final_url = urlh.geturl()
        without_query = final_url.split('?')[0]
        without_fragment = without_query.split('#')[0]
        securevideo_url = without_fragment.strip('/') + '.mssecurevideo.json'

        try:
            video = self._download_json(securevideo_url, display_id)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                self.raise_login_required()
            raise

        # We are dealing with a '../<show>.relevant' URL
        redirect_url = video.get('url')
        if redirect_url:
            return self.url_result(self._proto_relative_url(redirect_url, 'https:'))

        # There is only one entry, but with an unknown key, so just get
        # the first one
        first_entry = list(video.values())[0]
        video_id = first_entry.get('videoid')

        page_info = {
            '_type': 'url_transparent',
            'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'season': season,
            'season_number': season_number,
            'episode_number': episode_number,
            'release_date': release_date,
        }
        return merge_dicts(json_ld_info, page_info)
class CarambaTVPageIE(InfoExtractor):
    """Resolve carambatv.ru pages to a Videomore or CarambaTV video."""

    _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/',
        'md5': 'a49fb0ec2ad66503eeb46aac237d3c86',
        'info_dict': {
            'id': '475222',
            'ext': 'flv',
            'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
            'thumbnail': r're:^https?://.*\.jpg',
            # duration reported by videomore is incorrect
            'duration': int,
        },
        'add_ie': [VideomoreIE.ie_key()],
    }

    def _real_extract(self, url):
        """Return a delegating result for the video embedded in the page.

        A Videomore embed (explicit URL or ``getVMCode(<id>)`` call) wins and
        is returned as ``url_transparent`` carrying the page's og:title.
        Otherwise the ``video:iframe`` OpenGraph property is used, falling
        back to a numeric id scraped from the page.
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)

        videomore_url = VideomoreIE._extract_url(webpage)
        if not videomore_url:
            videomore_id = self._search_regex(
                r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id',
                default=None)
            if videomore_id:
                videomore_url = 'videomore:%s' % videomore_id

        if videomore_url:
            page_title = self._og_search_title(webpage)
            return {
                '_type': 'url_transparent',
                'url': videomore_url,
                'ie_key': VideomoreIE.ie_key(),
                'title': page_title,
            }

        iframe_url = self._og_search_property('video:iframe', webpage, default=None)
        if iframe_url:
            return self.url_result(iframe_url, CarambaTVIE.ie_key())

        # No iframe advertised: this lookup is fatal when the id is missing.
        caramba_id = self._search_regex(
            r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)',
            webpage, 'video id')
        return self.url_result('carambatv:%s' % caramba_id, CarambaTVIE.ie_key())
class CBCIE(InfoExtractor):
    """Extractor for cbc.ca article pages that embed one or more players.

    Every embedded player is turned into a ``cbcplayer:<id>`` entry and the
    page is returned as a playlist.
    """

    IE_NAME = 'cbc.ca'
    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
    _TESTS = [{
        # with mediaId
        'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
        'md5': '97e24d09672fc4cf56256d6faa6c25bc',
        'info_dict': {
            'id': '2682904050',
            'ext': 'mp4',
            'title': 'Don Cherry – All-Stars',
            'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
            'timestamp': 1454463000,
            'upload_date': '20160203',
            'uploader': 'CBCC-NEW',
        },
        'skip': 'Geo-restricted to Canada',
    }, {
        # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com
        'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4',
        'md5': '162adfa070274b144f4fdc3c3b8207db',
        'info_dict': {
            'id': '2414435309',
            'ext': 'mp4',
            'title': '22 Minutes Update: What Not To Wear Quebec',
            'description': "This week's latest Canadian top political story is What Not To Wear Quebec.",
            'upload_date': '20131025',
            'uploader': 'CBCC-NEW',
            'timestamp': 1382717907,
        },
    }, {
        # with clipId, feed only available via tpfeed.cbc.ca
        'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
        'md5': '0274a90b51a9b4971fe005c63f592f12',
        'info_dict': {
            'id': '2487345465',
            'ext': 'mp4',
            'title': 'Robin Williams freestyles on 90 Minutes Live',
            'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
            'upload_date': '19780210',
            'uploader': 'CBCC-NEW',
            'timestamp': 255977160,
        },
    }, {
        # multiple iframes
        'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
        'playlist': [{
            'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
            'info_dict': {
                'id': '2680832926',
                'ext': 'mp4',
                'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
                'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
                'upload_date': '20160201',
                'timestamp': 1454342820,
                'uploader': 'CBCC-NEW',
            },
        }, {
            'md5': '415a0e3f586113894174dfb31aa5bb1a',
            'info_dict': {
                'id': '2658915080',
                'ext': 'mp4',
                'title': 'Fly like an eagle!',
                'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
                'upload_date': '20150315',
                'timestamp': 1426443984,
                'uploader': 'CBCC-NEW',
            },
        }],
        'skip': 'Geo-restricted to Canada',
    }, {
        # multiple CBC.APP.Caffeine.initInstance(...)
        'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238',
        'info_dict': {
            'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
            'id': 'dog-indoor-exercise-winter-1.3928238',
            'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
        },
        'playlist_mincount': 6,
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that CBCPlayerIE already handles."""
        return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url)

    def _extract_player_init(self, player_init, display_id):
        """Turn one ``initInstance({...})`` argument into a CBCPlayer result.

        Uses ``mediaId`` when present; otherwise resolves ``clipId`` through
        the tpfeed.cbc.ca feed (non-fatal) and, failing that, through
        feed.theplatform.com (fatal).
        """
        player_info = self._parse_json(player_init, display_id, js_to_json)
        media_id = player_info.get('mediaId')
        if not media_id:
            clip_id = player_info['clipId']
            feed = self._download_json(
                'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id,
                clip_id, fatal=False)
            if feed:
                media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str)
            if not media_id:
                media_id = self._download_json(
                    'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
                    clip_id)['entries'][0]['id'].split('/')[-1]
        return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)

    def _real_extract(self, url):
        """Return a playlist of every player found on the page."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
            'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
                r'<title>([^<]+)', webpage, 'title', fatal=False)
        entries = [
            self._extract_player_init(player_init, display_id)
            for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
        media_ids = []
        # NOTE(review): the first two patterns had lost their leading
        # ``<tag[^>`` part and started with a stray ``]+``; the tag names
        # (iframe / div) are restored from context -- confirm against upstream.
        for media_id_re in (
                r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
                r'<div[^>]+\bid=["\']player-(\d+)',
                r'guid["\']\s*:\s*["\'](\d+)'):
            media_ids.extend(re.findall(media_id_re, webpage))
        # orderedSet is applied so each scraped id yields a single entry.
        entries.extend([
            self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
            for media_id in orderedSet(media_ids)])
        return self.playlist_result(
            entries, display_id, strip_or_none(title),
            self._og_search_description(webpage))
class CBCWatchBaseIE(InfoExtractor):
    """Shared device registration, API access and RSS parsing for CBC Watch."""

    # Device credentials; loaded from the cache or acquired on demand.
    _device_id = None
    _device_token = None
    _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/'
    _NS_MAP = {
        'media': 'http://search.yahoo.com/mrss/',
        'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
    }
    _GEO_COUNTRIES = ['CA']
    _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login'
    _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token'
    _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
    _NETRC_MACHINE = 'cbcwatch'

    def _signature(self, email, password):
        """Log in at LoginRadius and return the JWT ``signature`` value."""
        data = json.dumps({
            'email': email,
            'password': password,
        }).encode()
        headers = {'content-type': 'application/json'}
        query = {'apikey': self._API_KEY}
        resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query)
        access_token = resp['access_token']

        # token
        query = {
            'access_token': access_token,
            'apikey': self._API_KEY,
            'jwtapp': 'jwt',
        }
        resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query)
        return resp['signature']

    def _call_api(self, path, video_id):
        """Fetch *path* (relative to the API base, or absolute) as XML.

        A first HTTP 401 is taken to mean an expired device token: the device
        is re-registered and the request retried once. A second 401 is
        re-raised instead of falling out of the loop and returning ``None``.

        Raises:
            ExtractorError: on download failure or when the response carries
                a ``userMessage`` / ``systemMessage`` element.
        """
        url = path if path.startswith('http') else self._API_BASE_URL + path
        for attempt in range(2):
            try:
                result = self._download_xml(url, video_id, headers={
                    'X-Clearleap-DeviceId': self._device_id,
                    'X-Clearleap-DeviceToken': self._device_token,
                })
            except ExtractorError as e:
                token_expired = (
                    isinstance(e.cause, compat_HTTPError) and e.cause.code == 401)
                if token_expired and attempt == 0:
                    # Device token has expired, re-acquiring device token
                    self._register_device()
                    continue
                raise
            error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage')
            if error_message:
                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message))
            return result

    def _real_initialize(self):
        """Ensure device credentials exist: memory, then cache, then register."""
        if self._valid_device_token():
            return
        device = self._downloader.cache.load(
            'cbcwatch', self._cache_device_key()) or {}
        self._device_id, self._device_token = device.get('id'), device.get('token')
        if self._valid_device_token():
            return
        self._register_device()

    def _valid_device_token(self):
        """Return a truthy value when both device id and token are set."""
        return self._device_id and self._device_token

    def _cache_device_key(self):
        """Return the cache key, made per-account by hashing the login e-mail."""
        email, _ = self._get_login_info()
        return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device'

    def _register_device(self):
        """Register a device, optionally log it in, and cache the credentials.

        NOTE(review): both XML request bodies had been reduced to the bare
        text ``web`` / ``{0}{1}web``; the markup below is reconstructed from
        that residue, the ``application/xml`` content type and the use of
        ``escape`` -- confirm against upstream.
        """
        result = self._download_xml(
            self._API_BASE_URL + 'device/register',
            None, 'Acquiring device token',
            data=b'<device><type>web</type></device>')
        self._device_id = xpath_text(result, 'deviceId', fatal=True)
        email, password = self._get_login_info()
        if email and password:
            signature = self._signature(email, password)
            data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format(
                escape(signature), escape(self._device_id)).encode()
            url = self._API_BASE_URL + 'device/login'
            result = self._download_xml(
                url, None, data=data,
                headers={'content-type': 'application/xml'})
            self._device_token = xpath_text(result, 'token', fatal=True)
        else:
            self._device_token = xpath_text(result, 'deviceToken', fatal=True)
        self._downloader.cache.store(
            'cbcwatch', self._cache_device_key(), {
                'id': self._device_id,
                'token': self._device_token,
            })

    def _parse_rss_feed(self, rss):
        """Convert an RSS element into a playlist of CBCWatchVideo entries."""
        channel = xpath_element(rss, 'channel', fatal=True)

        def _add_ns(path):
            return xpath_with_ns(path, self._NS_MAP)

        entries = []
        for item in channel.findall('item'):
            guid = xpath_text(item, 'guid', fatal=True)
            title = xpath_text(item, 'title', fatal=True)

            media_group = xpath_element(item, _add_ns('media:group'), fatal=True)
            content = xpath_element(media_group, _add_ns('media:content'), fatal=True)
            content_url = content.attrib['url']

            thumbnails = []
            for thumbnail in media_group.findall(_add_ns('media:thumbnail')):
                thumbnail_url = thumbnail.get('url')
                if not thumbnail_url:
                    continue
                thumbnails.append({
                    'id': thumbnail.get('profile'),
                    'url': thumbnail_url,
                    'width': int_or_none(thumbnail.get('width')),
                    'height': int_or_none(thumbnail.get('height')),
                })

            timestamp = None
            # The release date is carried in a media:credit element whose
            # role attribute is 'releaseDate'.
            release_date = find_xpath_attr(
                item, _add_ns('media:credit'), 'role', 'releaseDate')
            if release_date is not None:
                timestamp = parse_iso8601(release_date.text)

            entries.append({
                '_type': 'url_transparent',
                'url': content_url,
                'id': guid,
                'title': title,
                'description': xpath_text(item, 'description'),
                'timestamp': timestamp,
                'duration': int_or_none(content.get('duration')),
                'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))),
                'episode': xpath_text(item, _add_ns('clearleap:episode')),
                'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))),
                'series': xpath_text(item, _add_ns('clearleap:series')),
                'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))),
                'thumbnails': thumbnails,
                'ie_key': 'CBCWatchVideo',
            })

        return self.playlist_result(
            entries, xpath_text(channel, 'guid'),
            xpath_text(channel, 'title'),
            xpath_text(channel, 'description'))
class CBCWatchIE(CBCWatchBaseIE):
    """Extractor for watch.cbc.ca / gem.cbc.ca pages (returned as playlists)."""

    IE_NAME = 'cbc.ca:watch'
    # The named group had been reduced to ``(?P[0-9a-f-]+)``, which is not a
    # valid pattern; ``_match_id`` below needs a group called ``id``.
    _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
    _TESTS = [{
        # geo-restricted to Canada, bypassable
        'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',
        'info_dict': {
            'id': '9673749a-5e77-484c-8b62-a1092a6b5168',
            'ext': 'mp4',
            'title': 'Customer (Dis)Service',
            'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87',
            'upload_date': '20160219',
            'timestamp': 1455840000,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
            'format': 'bestvideo',
        },
    }, {
        # geo-restricted to Canada, bypassable
        'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057',
        'info_dict': {
            'id': '1ed4b385-cd84-49cf-95f0-80f004680057',
            'title': 'Arthur',
            'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',
        },
        'playlist_mincount': 30,
    }, {
        'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Browse the id taken from *url* and return its RSS feed as a playlist."""
        video_id = self._match_id(url)
        rss = self._call_api('web/browse/' + video_id, video_id)
        return self._parse_rss_feed(rss)
ism_id=video_source_format, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id=video_source_format, fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': xpath_text(video_doc, 'description'), + 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), + 'duration': parse_duration(xpath_text(video_doc, 'duration')), + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dlc/extractor/cbs.py b/youtube_dlc/extractor/cbs.py new file mode 100644 index 0000000..4a19a73 --- /dev/null +++ b/youtube_dlc/extractor/cbs.py @@ -0,0 +1,112 @@ +from __future__ import unicode_literals + +from .theplatform import ThePlatformFeedIE +from ..utils import ( + ExtractorError, + int_or_none, + find_xpath_attr, + xpath_element, + xpath_text, + update_url_query, +) + + +class CBSBaseIE(ThePlatformFeedIE): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + subtitles = {} + for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]: + cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k) + if cc_e is not None: + cc_url = cc_e.get('value') + if cc_url: + subtitles.setdefault(subtitles_lang, []).append({ + 'ext': ext, + 'url': cc_url, + }) + return subtitles + + +class CBSIE(CBSBaseIE): + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' + + _TESTS = [{ + 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'info_dict': { + 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', + 'ext': 'mp4', + 'title': 'Connect Chat feat. Garth Brooks', + 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. 
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + 'duration': 1495, + 'timestamp': 1385585425, + 'upload_date': '20131127', + 'uploader': 'CBSI-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + '_skip': 'Blocked outside the US', + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', + 'only_matching': True, + }, { + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, + }] + + def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): + items_data = self._download_xml( + 'http://can.cbs.com/thunder/player/videoPlayerService.php', + content_id, query={'partner': site, 'contentId': content_id}) + video_data = xpath_element(items_data, './/item') + title = xpath_text(video_data, 'videoTitle', 'title', True) + tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id) + tp_release_url = 'http://link.theplatform.com/s/' + tp_path + + asset_types = [] + subtitles = {} + formats = [] + last_e = None + for item in items_data.findall('.//item'): + asset_type = xpath_text(item, 'assetType') + if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type: + continue + asset_types.append(asset_type) + query = { + 'mbr': 'true', + 'assetTypes': asset_type, + } + if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'): + query['formats'] = 'MPEG4,M3U' + elif asset_type in ('RTMP', 'WIFI', '3G'): + query['formats'] = 'MPEG4,FLV' + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query(tp_release_url, query), content_id, + 'Downloading %s SMIL data' % asset_type) + except ExtractorError as e: + last_e = e + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + raise last_e + self._sort_formats(formats) + 
+ info = self._extract_theplatform_metadata(tp_path, content_id) + info.update({ + 'id': content_id, + 'title': title, + 'series': xpath_text(video_data, 'seriesTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), + 'thumbnail': xpath_text(video_data, 'previewImageURL'), + 'formats': formats, + 'subtitles': subtitles, + }) + return info + + def _real_extract(self, url): + content_id = self._match_id(url) + return self._extract_video_info(content_id) diff --git a/youtube_dlc/extractor/cbsinteractive.py b/youtube_dlc/extractor/cbsinteractive.py new file mode 100644 index 0000000..6596e98 --- /dev/null +++ b/youtube_dlc/extractor/cbsinteractive.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .cbs import CBSIE +from ..utils import int_or_none + + +class CBSInteractiveIE(CBSIE): + _VALID_URL = r'https?://(?:www\.)?(?Pcnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P[^/?]+)' + _TESTS = [{ + 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', + 'info_dict': { + 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', + 'display_id': 'hands-on-with-microsofts-windows-8-1-update', + 'ext': 'mp4', + 'title': 'Hands-on with Microsoft Windows 8.1 Update', + 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', + 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', + 'uploader': 'Sarah Mitroff', + 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', + 'info_dict': { + 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', + 'display_id': 
'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', + 'ext': 'mp4', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', + }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', + 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'duration': 731, + 'timestamp': 1449129925, + 'upload_date': '20151203', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', + 'only_matching': True, + }] + + MPX_ACCOUNTS = { + 'cnet': 2198311517, + 'zdnet': 2387448114, + } + + def _real_extract(self, url): + site, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + + data_json = self._html_search_regex( + r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", + webpage, 'data json') + data = self._parse_json(data_json, display_id) + vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] + + video_id = vdata['mpxRefId'] + + title = vdata['title'] + author = vdata.get('author') + if author: + uploader = '%s %s' % (author['firstName'], author['lastName']) + uploader_id = author.get('id') + else: + uploader = None + uploader_id = None + + info = 
self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) + info.update({ + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'duration': int_or_none(vdata.get('duration')), + 'uploader': uploader, + 'uploader_id': uploader_id, + }) + return info diff --git a/youtube_dlc/extractor/cbslocal.py b/youtube_dlc/extractor/cbslocal.py new file mode 100644 index 0000000..90852a9 --- /dev/null +++ b/youtube_dlc/extractor/cbslocal.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .anvato import AnvatoIE +from .sendtonews import SendtoNewsIE +from ..compat import compat_urlparse +from ..utils import ( + parse_iso8601, + unified_timestamp, +) + + +class CBSLocalIE(AnvatoIE): + _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P[0-9a-z-]+)' + + _TESTS = [{ + # Anvato backend + 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. 
Randy Paige reports.', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\KCBSTV', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\AOL', + 'Syndication\\Yahoo', + 'Syndication\\Tribune', + 'Syndication\\Curb.tv', + 'Content\\News' + ], + 'tags': ['CBS 2 News Evening'], + }, + }, { + # SendtoNews embed + 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588', + }, + 'playlist_count': 9, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', + 'info_dict': { + 'id': '3580809', + 'ext': 'mp4', + 'title': 'A Very Blue Anniversary', + 'description': 'CBS2’s Cindy Hsu has more.', + 'thumbnail': 're:^https?://.*', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\WCBSTV', + 'Syndication\\AOL', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\Yahoo', + 'Content\\News', + 'Content\\News\\Local News', + ], + 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sendtonews_url = SendtoNewsIE._extract_url(webpage) + if sendtonews_url: + return self.url_result( + compat_urlparse.urljoin(url, sendtonews_url), + ie=SendtoNewsIE.ie_key()) + + info_dict = self._extract_anvato_videos(webpage, display_id) + + timestamp = unified_timestamp(self._html_search_regex( + r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, + 'released date', default=None)) or parse_iso8601( + self._html_search_meta('uploadDate', webpage)) + + info_dict.update({ + 'display_id': 
display_id, + 'timestamp': timestamp, + }) + + return info_dict diff --git a/youtube_dlc/extractor/cbsnews.py b/youtube_dlc/extractor/cbsnews.py new file mode 100644 index 0000000..345debc --- /dev/null +++ b/youtube_dlc/extractor/cbsnews.py @@ -0,0 +1,147 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import zlib + +from .common import InfoExtractor +from .cbs import CBSIE +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) +from ..utils import ( + parse_duration, +) + + +class CBSNewsEmbedIE(CBSIE): + IE_NAME = 'cbsnews:embed' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P.+)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC2
1hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', + 'only_matching': True, + }] + + def _real_extract(self, url): + item = self._parse_json(zlib.decompress(compat_b64decode( + compat_urllib_parse_unquote(self._match_id(url))), + -zlib.MAX_WBITS), None)['video']['items'][0] + return self._extract_video_info(item['mpxRefId'], 'cbsnews') + + +class CBSNewsIE(CBSIE): + IE_NAME = 'cbsnews' + IE_DESC = 'CBS News' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P[\da-z_-]+)' + + _TESTS = [ + { + # 60 minutes + 'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/', + 'info_dict': { + 'id': 'Y_nf_aEg6WwO9OLAq0MpKaPgfnBUxfW4', + 'ext': 'flv', + 'title': 'Artificial Intelligence, real-life applications', + 'description': 'md5:a7aaf27f1b4777244de8b0b442289304', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 317, + 'uploader': 'CBSI-NEW', + 'timestamp': 1476046464, + 'upload_date': '20161009', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 
'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', + 'info_dict': { + 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y', + 'ext': 'mp4', + 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', + 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', + 'upload_date': '20140404', + 'timestamp': 1396650660, + 'uploader': 'CBSI-NEW', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 205, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + # 48 hours + 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', + 'info_dict': { + 'title': 'Cold as Ice', + 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', + }, + 'playlist_mincount': 7, + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for embed_url in re.findall(r']+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): + entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key())) + if entries: + return self.playlist_result( + entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage), + playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + + item = self._parse_json(self._html_search_regex( + r'CBSNEWS\.defaultPayload\s*=\s*({.+})', + webpage, 'video JSON info'), display_id)['items'][0] + return self._extract_video_info(item['mpxRefId'], 'cbsnews') + + +class CBSNewsLiveVideoIE(InfoExtractor): + IE_NAME = 'cbsnews:livevideo' + IE_DESC = 'CBS News Live Videos' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[^/?#]+)' + + # Live videos get deleted soon. 
See http://www.cbsnews.com/live/ for the latest examples + _TEST = { + 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', + 'info_dict': { + 'id': 'clinton-sanders-prepare-to-face-off-in-nh', + 'ext': 'mp4', + 'title': 'Clinton, Sanders Prepare To Face Off In NH', + 'duration': 334, + }, + 'skip': 'Video gone', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + video_info = self._download_json( + 'http://feeds.cbsn.cbsnews.com/rundown/story', display_id, query={ + 'device': 'desktop', + 'dvr_slug': display_id, + }) + + formats = self._extract_akamai_formats(video_info['url'], display_id) + self._sort_formats(formats) + + return { + 'id': display_id, + 'display_id': display_id, + 'title': video_info['headline'], + 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), + 'duration': parse_duration(video_info.get('segmentDur')), + 'formats': formats, + } diff --git a/youtube_dlc/extractor/cbssports.py b/youtube_dlc/extractor/cbssports.py new file mode 100644 index 0000000..83b7647 --- /dev/null +++ b/youtube_dlc/extractor/cbssports.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals + +from .cbs import CBSBaseIE + + +class CBSSportsIE(CBSBaseIE): + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', + 'info_dict': { + 'id': '1214315075735', + 'ext': 'mp4', + 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', + 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', + 'timestamp': 1524111457, + 'upload_date': '20180419', + 'uploader': 'CBSI-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'only_matching': 
True, + }] + + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], + webpage, 'video id') + return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dlc/extractor/ccc.py b/youtube_dlc/extractor/ccc.py new file mode 100644 index 0000000..36e6dff --- /dev/null +++ b/youtube_dlc/extractor/ccc.py @@ -0,0 +1,111 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, + url_or_none, +) + + +class CCCIE(InfoExtractor): + IE_NAME = 'media.ccc.de' + _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', + 'md5': '3a1eda8f3a29515d27f5adb967d7e740', + 'info_dict': { + 'id': '1839', + 'ext': 'mp4', + 'title': 'Introduction to Processor Design', + 'creator': 'byterazor', + 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20131228', + 'timestamp': 1388188800, + 'duration': 3710, + 'tags': list, + } + }, { + 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + event_id = self._search_regex(r"data-id='(\d+)'", webpage, 'event id') + event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) + + formats = [] + for recording in event_data.get('recordings', []): + recording_url = recording.get('recording_url') + if not recording_url: + 
continue + language = recording.get('language') + folder = recording.get('folder') + format_id = None + if language: + format_id = language + if folder: + if language: + format_id += '-' + folder + else: + format_id = folder + vcodec = 'h264' if 'h264' in folder else ( + 'none' if folder in ('mp3', 'opus') else None + ) + formats.append({ + 'format_id': format_id, + 'url': recording_url, + 'width': int_or_none(recording.get('width')), + 'height': int_or_none(recording.get('height')), + 'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024), + 'language': language, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + return { + 'id': event_id, + 'display_id': display_id, + 'title': event_data['title'], + 'creator': try_get(event_data, lambda x: ', '.join(x['persons'])), + 'description': event_data.get('description'), + 'thumbnail': event_data.get('thumb_url'), + 'timestamp': parse_iso8601(event_data.get('date')), + 'duration': int_or_none(event_data.get('length')), + 'tags': event_data.get('tags'), + 'formats': formats, + } + + +class CCCPlaylistIE(InfoExtractor): + IE_NAME = 'media.ccc.de:lists' + _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/c/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://media.ccc.de/c/30c3', + 'info_dict': { + 'title': '30C3', + 'id': '30c3', + }, + 'playlist_count': 135, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url).lower() + + conf = self._download_json( + 'https://media.ccc.de/public/conferences/' + playlist_id, + playlist_id) + + entries = [] + for e in conf['events']: + event_url = url_or_none(e.get('frontend_link')) + if event_url: + entries.append(self.url_result(event_url, ie=CCCIE.ie_key())) + + return self.playlist_result(entries, playlist_id, conf.get('title')) diff --git a/youtube_dlc/extractor/ccma.py b/youtube_dlc/extractor/ccma.py new file mode 100644 index 0000000..544647f --- /dev/null +++ b/youtube_dlc/extractor/ccma.py @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ 
import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_duration, + parse_iso8601, + parse_resolution, + url_or_none, +) + + +class CCMAIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?Pvideo|audio)/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + 'md5': '7296ca43977c8ea4469e719c609b0871', + 'info_dict': { + 'id': '5630208', + 'ext': 'mp4', + 'title': 'L\'espot de La Marató de TV3', + 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', + 'timestamp': 1470918540, + 'upload_date': '20160811', + } + }, { + 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + 'md5': 'fa3e38f269329a278271276330261425', + 'info_dict': { + 'id': '943685', + 'ext': 'mp3', + 'title': 'El Consell de Savis analitza el derbi', + 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', + 'upload_date': '20171205', + 'timestamp': 1512507300, + } + }] + + def _real_extract(self, url): + media_type, media_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'media': media_type, + 'idint': media_id, + }) + + formats = [] + media_url = media['media']['url'] + if isinstance(media_url, list): + for format_ in media_url: + format_url = url_or_none(format_.get('file')) + if not format_url: + continue + label = format_.get('label') + f = parse_resolution(label) + f.update({ + 'url': format_url, + 'format_id': label, + }) + formats.append(f) + else: + formats.append({ + 'url': media_url, + 'vcodec': 'none' if media_type == 'audio' else None, + }) + self._sort_formats(formats) + + informacio = media['informacio'] + title = informacio['titol'] + durada = informacio.get('durada', {}) + duration = int_or_none(durada.get('milisegons'), 1000) or 
parse_duration(durada.get('text')) + timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + + subtitles = {} + subtitols = media.get('subtitols', {}) + if subtitols: + sub_url = subtitols.get('url') + if sub_url: + subtitles.setdefault( + subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + 'url': sub_url, + }) + + thumbnails = [] + imatges = media.get('imatges', {}) + if imatges: + thumbnail_url = imatges.get('url') + if thumbnail_url: + thumbnails = [{ + 'url': thumbnail_url, + 'width': int_or_none(imatges.get('amplada')), + 'height': int_or_none(imatges.get('alcada')), + }] + + return { + 'id': media_id, + 'title': title, + 'description': clean_html(informacio.get('descripcio')), + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/cctv.py b/youtube_dlc/extractor/cctv.py new file mode 100644 index 0000000..c76f361 --- /dev/null +++ b/youtube_dlc/extractor/cctv.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, + unified_timestamp, +) + + +class CCTVIE(InfoExtractor): + IE_DESC = '央视网' + _VALID_URL = r'https?://(?:(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)|(?:www\.)?ncpa-classic\.com)/(?:[^/]+/)*?(?P[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)' + _TESTS = [{ + # fo.addVariable("videoCenterId","id") + 'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml', + 'md5': 'd61ec00a493e09da810bf406a078f691', + 'info_dict': { + 'id': '5ecdbeab623f4973b40ff25f18b174e8', + 'ext': 'mp4', + 'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)', + 'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95', + 'duration': 98, + 'uploader': 'songjunjie', + 'timestamp': 1455279956, + 'upload_date': '20160212', + }, + }, { + # var guid = "id" + 'url': 
'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml', + 'info_dict': { + 'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae', + 'ext': 'mp4', + 'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)', + 'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。', + 'duration': 37, + 'uploader': 'shujun', + 'timestamp': 1454677291, + 'upload_date': '20160205', + }, + 'params': { + 'skip_download': True, + }, + }, { + # changePlayer('id') + 'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml', + 'info_dict': { + 'id': '4bb9bb4db7a6471ba85fdeda5af0381e', + 'ext': 'mp4', + 'title': 'NHnews008 ANNUAL POLITICAL SEASON', + 'description': 'Four Comprehensives', + 'duration': 60, + 'uploader': 'zhangyunlei', + 'timestamp': 1425385521, + 'upload_date': '20150303', + }, + 'params': { + 'skip_download': True, + }, + }, { + # loadvideo('id') + 'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml', + 'info_dict': { + 'id': 'b15f009ff45c43968b9af583fc2e04b2', + 'ext': 'mp4', + 'title': 'Путь,усыпанный космеями Серия 1', + 'description': 'Путь, усыпанный космеями', + 'duration': 2645, + 'uploader': 'renxue', + 'timestamp': 1477479241, + 'upload_date': '20161026', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var initMyAray = 'id' + 'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml', + 'info_dict': { + 'id': 'a194cfa7f18c426b823d876668325946', + 'ext': 'mp4', + 'title': '小泽征尔音乐塾 音乐梦想无国界', + 'duration': 2173, + 'timestamp': 1369248264, + 'upload_date': '20130522', + }, + 'params': { + 'skip_download': True, + }, + }, { + # var ids = ["id"] + 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', + 'info_dict': { + 'id': 'a8606119a4884588a79d81c02abecc16', + 'ext': 'mp3', + 'title': '来自维也纳的新年贺礼', + 'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7', + 'duration': 1578, + 'uploader': 'djy', + 'timestamp': 1482942419, + 'upload_date': '20161228', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': 
['Failed to download m3u8 information'], + }, { + 'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44', + 'only_matching': True, + }, { + 'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', + r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', + r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', + r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', + r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', + r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'], + webpage, 'video id') + + data = self._download_json( + 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id, + query={ + 'pid': video_id, + 'url': url, + 'idl': 32, + 'idlr': 32, + 'modifyed': 'false', + }) + + title = data['title'] + + formats = [] + + video = data.get('video') + if isinstance(video, dict): + for quality, chapters_key in enumerate(('lowChapters', 'chapters')): + video_url = try_get( + video, lambda x: x[chapters_key][0]['url'], compat_str) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': 'http', + 'quality': quality, + 'preference': -1, + }) + + hls_url = try_get(data, lambda x: x['hls_url'], compat_str) + if hls_url: + hls_url = re.sub(r'maxbr=\d+&?', '', hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + + uploader = data.get('editer_name') + description = self._html_search_meta( + 
'description', webpage, default=None) + timestamp = unified_timestamp(data.get('f_pgmtime')) + duration = float_or_none(try_get(video, lambda x: x['totalLength'])) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/cda.py b/youtube_dlc/extractor/cda.py new file mode 100644 index 0000000..0c3af23 --- /dev/null +++ b/youtube_dlc/extractor/cda.py @@ -0,0 +1,182 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import codecs +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + multipart_encode, + parse_duration, + random_birthday, + urljoin, +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _BASE_URL = 'http://www.cda.pl/' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'description': 'md5:269ccd135d550da90d1662651fcb9772', + 'thumbnail': r're:^https?://.*\.jpg$', + 'average_rating': float, + 'duration': 39, + 'age_limit': 0, + } + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'crash404', + 'view_count': int, + 'average_rating': float, + 'duration': 137, + 'age_limit': 0, + } + }, { + # Age-restricted + 'url': 'http://www.cda.pl/video/1273454c4', + 'info_dict': { + 'id': '1273454c4', + 'ext': 'mp4', + 'title': 'Bronson (2008) napisy HD 1080p', + 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 
'height': 1080, + 'uploader': 'boniek61', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5554, + 'age_limit': 18, + 'view_count': int, + 'average_rating': float, + }, + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] + + def _download_age_confirm_page(self, url, video_id, *args, **kwargs): + form_data = random_birthday('rok', 'miesiac', 'dzien') + form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) + data, content_type = multipart_encode(form_data) + return self._download_webpage( + urljoin(url, '/a/validatebirth'), video_id, *args, + data=data, headers={ + 'Referer': url, + 'Content-Type': content_type, + }, **kwargs) + + def _real_extract(self, url): + video_id = self._match_id(url) + self._set_cookie('cda.pl', 'cda.player', 'html5') + webpage = self._download_webpage( + self._BASE_URL + '/video/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + need_confirm_age = False + if self._html_search_regex(r'(]+action="/a/validatebirth")', + webpage, 'birthday validate form', default=None): + webpage = self._download_age_confirm_page( + url, video_id, note='Confirming age') + need_confirm_age = True + + formats = [] + + uploader = self._search_regex(r'''(?x) + <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> + (?:<\1[^>]*>[^<]*|(?!)(?:.|\n))*? 
+ <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P[^<]+) + ''', webpage, 'uploader', default=None, group='uploader') + view_count = self._search_regex( + r'Odsłony:(?:\s| )*([0-9]+)', webpage, + 'view_count', default=None) + average_rating = self._search_regex( + r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P[0-9.]+)', + webpage, 'rating', fatal=False, group='rating_value') + + info_dict = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'uploader': uploader, + 'view_count': int_or_none(view_count), + 'average_rating': float_or_none(average_rating), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': None, + 'age_limit': 18 if need_confirm_age else 0, + } + + def extract_format(page, version): + json_str = self._html_search_regex( + r'player_data=(\\?["\'])(?P.+?)\1', page, + '%s player_json' % version, fatal=False, group='player_data') + if not json_str: + return + player_data = self._parse_json( + json_str, '%s player_data' % version, fatal=False) + if not player_data: + return + video = player_data.get('video') + if not video or 'file' not in video: + self.report_warning('Unable to extract %s version information' % version) + return + if video['file'].startswith('uggc'): + video['file'] = codecs.decode(video['file'], 'rot_13') + if video['file'].endswith('adc.mp4'): + video['file'] = video['file'].replace('adc.mp4', '.mp4') + f = { + 'url': video['file'], + } + m = re.search( + r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', + page) + if m: + f.update({ + 'format_id': m.group('format_id'), + 'height': int(m.group('height')), + }) + info_dict['formats'].append(f) + if not info_dict['duration']: + info_dict['duration'] = parse_duration(video.get('duration')) + + extract_format(webpage, 'default') + + for href, resolution in re.findall( + 
r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + if need_confirm_age: + handler = self._download_age_confirm_page + else: + handler = self._download_webpage + + webpage = handler( + self._BASE_URL + href, video_id, + 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return info_dict diff --git a/youtube_dlc/extractor/ceskatelevize.py b/youtube_dlc/extractor/ceskatelevize.py new file mode 100644 index 0000000..7cb4efb --- /dev/null +++ b/youtube_dlc/extractor/ceskatelevize.py @@ -0,0 +1,289 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + sanitized_Request, + unescapeHTML, + update_url_query, + urlencode_postdata, + USER_AGENTS, +) + + +class CeskaTelevizeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P[^/#?]+)' + _TESTS = [{ + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '61924494877246241', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Život v Grónsku', + 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 3350, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', + 'info_dict': { + 'id': '61924494877028507', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 
'description': 'English Subtittles', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 81.3, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # live stream + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'info_dict': { + 'id': 402, + 'ext': 'mp4', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to Czech Republic', + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' + if '%s

' % NOT_AVAILABLE_STRING in webpage: + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + type_ = None + episode_id = None + + playlist = self._parse_json( + self._search_regex( + r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', + default='{}'), playlist_id) + if playlist: + type_ = playlist.get('type') + episode_id = playlist.get('id') + + if not type_: + type_ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', + webpage, 'type') + if not episode_id: + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', + webpage, 'episode_id') + + data = { + 'playlist[0][type]': type_, + 'playlist[0][id]': episode_id, + 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestSource': 'iVysilani', + } + + entries = [] + + for user_agent in (None, USER_AGENTS['Safari']): + req = sanitized_Request( + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=urlencode_postdata(data)) + + req.add_header('Content-type', 'application/x-www-form-urlencoded') + req.add_header('x-addr', '127.0.0.1') + req.add_header('X-Requested-With', 'XMLHttpRequest') + if user_agent: + req.add_header('User-Agent', user_agent) + req.add_header('Referer', url) + + playlistpage = self._download_json(req, playlist_id, fatal=False) + + if not playlistpage: + continue + + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) + req.add_header('Referer', url) + + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) + + playlist = self._download_json(req, playlist_id, fatal=False) + if not playlist: + continue + + playlist = playlist.get('playlist') + if not isinstance(playlist, list): + continue + + playlist_len = len(playlist) + + for num, item in enumerate(playlist): 
+ is_live = item.get('type') == 'LIVE' + formats = [] + for format_id, stream_url in item.get('streamUrls', {}).items(): + if 'drmOnly=true' in stream_url: + continue + if 'playerType=flash' in stream_url: + stream_formats = self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % format_id, fatal=False) + else: + stream_formats = self._extract_mpd_formats( + stream_url, playlist_id, + mpd_id='dash-%s' % format_id, fatal=False) + # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031 + if format_id == 'audioDescription': + for f in stream_formats: + f['source_preference'] = -10 + formats.extend(stream_formats) + + if user_agent and len(entries) == playlist_len: + entries[num]['formats'].extend(formats) + continue + + item_id = item.get('id') or item['assetId'] + title = item['title'] + + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + if playlist_len == 1: + final_title = playlist_title or title + if is_live: + final_title = self._live_title(final_title) + else: + final_title = '%s (%s)' % (playlist_title, title) + + entries.append({ + 'id': item_id, + 'title': final_title, + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + }) + + for e in entries: + self._sort_formats(e['formats']) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) + return { + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] + } + + @staticmethod + def 
_fix_subtitles(subtitles): + """ Convert millisecond-based subtitles to SRT """ + + def _msectotimecode(msec): + """ Helper utility to convert milliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield '{0} --> {1}'.format(start, stop) + else: + yield line + + return '\r\n'.join(_fix_subtitle(subtitles)) + + +class CeskaTelevizePoradyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P[^/#?]+)' + _TESTS = [{ + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494876844842', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data_url = update_url_query(unescapeHTML(self._search_regex( + (r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), + webpage, 'iframe 
player url', group='url')), query={ + 'autoStart': 'true', + }) + + return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dlc/extractor/channel9.py b/youtube_dlc/extractor/channel9.py new file mode 100644 index 0000000..09cacf6 --- /dev/null +++ b/youtube_dlc/extractor/channel9.py @@ -0,0 +1,262 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + parse_iso8601, + qualities, + unescapeHTML, +) + + +class Channel9IE(InfoExtractor): + IE_DESC = 'Channel 9' + IE_NAME = 'channel9' + _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P.+?)(?P/RSS)?/?(?:[?#&]|$)' + + _TESTS = [{ + 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', + 'md5': '32083d4eaf1946db6d454313f44510ca', + 'info_dict': { + 'id': '6c413323-383a-49dc-88f9-a22800cab024', + 'ext': 'wmv', + 'title': 'Developer Kick-Off Session: Stuff We Love', + 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', + 'duration': 4576, + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1377717420, + 'upload_date': '20130828', + 'session_code': 'KOS002', + 'session_room': 'Arena 1A', + 'session_speakers': 'count:5', + }, + }, { + 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', + 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', + 'info_dict': { + 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', + 'ext': 'wmv', + 'title': 'Self-service BI with Power BI - nuclear testing', + 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', + 'duration': 1540, + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1386381991, + 'upload_date': '20131207', + 'authors': ['Mike Wilmot'], + }, + }, { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', + 'ext': 'mp4', + 'title': 'Ranges for the 
Standard Library', + 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', + 'duration': 5646, + 'thumbnail': r're:https?://.*\.jpg', + 'upload_date': '20150930', + 'timestamp': 1443640735, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', + 'info_dict': { + 'id': 'Events/DEVintersection/DEVintersection-2016', + 'title': 'DEVintersection 2016 Orlando Sessions', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', + 'only_matching': True, + }, { + 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', + 'only_matching': True, + }] + + _RSS_URL = 'http://channel9.msdn.com/%s/RSS' + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', + webpage) + + def _extract_list(self, video_id, rss_url=None): + if not rss_url: + rss_url = self._RSS_URL % video_id + rss = self._download_xml(rss_url, video_id, 'Downloading RSS') + entries = [self.url_result(session_url.text, 'Channel9') + for session_url in rss.findall('./channel/item/link')] + title_text = rss.find('./channel/title').text + return self.playlist_result(entries, video_id, title_text) + + def _real_extract(self, url): + content_path, rss = re.match(self._VALID_URL, url).groups() + + if rss: + return self._extract_list(content_path, url) + + webpage = self._download_webpage( + url, content_path, 'Downloading web page') + + episode_data = self._search_regex( + r"data-episode='([^']+)'", webpage, 'episode data', default=None) + if episode_data: + episode_data = self._parse_json(unescapeHTML( + episode_data), content_path) + content_id = episode_data['contentId'] + is_session = '/Sessions(' in episode_data['api'] + content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + 
'?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' + if is_session: + content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' + else: + content_url += 'Authors,Body&$expand=Authors' + content_data = self._download_json(content_url, content_id) + title = content_data['Title'] + + QUALITIES = ( + 'mp3', + 'wmv', 'mp4', + 'wmv-low', 'mp4-low', + 'wmv-mid', 'mp4-mid', + 'wmv-high', 'mp4-high', + ) + + quality_key = qualities(QUALITIES) + + def quality(quality_id, format_url): + return (len(QUALITIES) if '_Source.' in format_url + else quality_key(quality_id)) + + formats = [] + urls = set() + + SITE_QUALITIES = { + 'MP3': 'mp3', + 'MP4': 'mp4', + 'Low Quality WMV': 'wmv-low', + 'Low Quality MP4': 'mp4-low', + 'Mid Quality WMV': 'wmv-mid', + 'Mid Quality MP4': 'mp4-mid', + 'High Quality WMV': 'wmv-high', + 'High Quality MP4': 'mp4-high', + } + + formats_select = self._search_regex( + r'(?s)]+name=["\']format[^>]+>(.+?)]+\bvalue=(["\'])(?P(?:(?!\1).)+)\1[^>]*>\s*(?P[^<]+?)\s*<', + formats_select): + format_url = mobj.group('url') + if format_url in urls: + continue + urls.add(format_url) + format_id = mobj.group('format') + quality_id = SITE_QUALITIES.get(format_id, format_id) + formats.append({ + 'url': format_url, + 'format_id': quality_id, + 'quality': quality(quality_id, format_url), + 'vcodec': 'none' if quality_id == 'mp3' else None, + }) + + API_QUALITIES = { + 'VideoMP4Low': 'mp4-low', + 'VideoWMV': 'wmv-mid', + 'VideoMP4Medium': 'mp4-mid', + 'VideoMP4High': 'mp4-high', + 'VideoWMVHQ': 'wmv-hq', + } + + for format_id, q in API_QUALITIES.items(): + q_url = content_data.get(format_id) + if not q_url or q_url in urls: + continue + urls.add(q_url) + formats.append({ + 'url': q_url, + 'format_id': q, + 'quality': quality(q, q_url), + }) + + self._sort_formats(formats) + + slides = content_data.get('Slides') + 
zip_file = content_data.get('ZipFile') + + if not formats and not slides and not zip_file: + raise ExtractorError( + 'None of recording, slides or zip are available for %s' % content_path) + + subtitles = {} + for caption in content_data.get('Captions', []): + caption_url = caption.get('Url') + if not caption_url: + continue + subtitles.setdefault(caption.get('Language', 'en'), []).append({ + 'url': caption_url, + 'ext': 'vtt', + }) + + common = { + 'id': content_id, + 'title': title, + 'description': clean_html(content_data.get('Description') or content_data.get('Body')), + 'thumbnail': content_data.get('VideoPlayerPreviewImage'), + 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), + 'timestamp': parse_iso8601(content_data.get('PublishedDate')), + 'avg_rating': int_or_none(content_data.get('Rating')), + 'rating_count': int_or_none(content_data.get('RatingCount')), + 'view_count': int_or_none(content_data.get('Views')), + 'comment_count': int_or_none(content_data.get('CommentCount')), + 'subtitles': subtitles, + } + if is_session: + speakers = [] + for s in content_data.get('Speakers', []): + speaker_name = s.get('FullName') + if not speaker_name: + continue + speakers.append(speaker_name) + + common.update({ + 'session_code': content_data.get('Code'), + 'session_room': content_data.get('Room'), + 'session_speakers': speakers, + }) + else: + authors = [] + for a in content_data.get('Authors', []): + author_name = a.get('DisplayName') + if not author_name: + continue + authors.append(author_name) + common['authors'] = authors + + contents = [] + + if slides: + d = common.copy() + d.update({'title': title + '-Slides', 'url': slides}) + contents.append(d) + + if zip_file: + d = common.copy() + d.update({'title': title + '-Zip', 'url': zip_file}) + contents.append(d) + + if formats: + d = common.copy() + d.update({'title': title, 'formats': formats}) + contents.append(d) + return self.playlist_result(contents) + else: + return 
self._extract_list(content_path) diff --git a/youtube_dlc/extractor/charlierose.py b/youtube_dlc/extractor/charlierose.py new file mode 100644 index 0000000..42c9af2 --- /dev/null +++ b/youtube_dlc/extractor/charlierose.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class CharlieRoseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P\d+)' + _TESTS = [{ + 'url': 'https://charlierose.com/videos/27996', + 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', + 'info_dict': { + 'id': '27996', + 'ext': 'mp4', + 'title': 'Remembering Zaha Hadid', + 'thumbnail': r're:^https?://.*\.jpg\?\d+', + 'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.', + 'subtitles': { + 'en': [{ + 'ext': 'vtt', + }], + }, + }, + }, { + 'url': 'https://charlierose.com/videos/27996', + 'only_matching': True, + }, { + 'url': 'https://charlierose.com/episodes/30887?autoplay=true', + 'only_matching': True, + }] + + _PLAYER_BASE = 'https://charlierose.com/video/player/%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id) + + title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') + + info_dict = self._parse_html5_media_entries( + self._PLAYER_BASE % video_id, webpage, video_id, + m3u8_entry_protocol='m3u8_native')[0] + + self._sort_formats(info_dict['formats']) + self._remove_duplicate_formats(info_dict['formats']) + + info_dict.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + }) + + return info_dict diff --git a/youtube_dlc/extractor/chaturbate.py b/youtube_dlc/extractor/chaturbate.py new file mode 100644 index 0000000..a459dcb --- /dev/null +++ b/youtube_dlc/extractor/chaturbate.py @@ -0,0 +1,109 @@ 
+from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + lowercase_escape, + url_or_none, +) + + +class ChaturbateIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.chaturbate.com/siswet19/', + 'info_dict': { + 'id': 'siswet19', + 'ext': 'mp4', + 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'age_limit': 18, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Room is offline', + }, { + 'url': 'https://chaturbate.com/fullvideo/?b=caylin', + 'only_matching': True, + }, { + 'url': 'https://en.chaturbate.com/siswet19/', + 'only_matching': True, + }] + + _ROOM_OFFLINE = 'Room is currently offline' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://chaturbate.com/%s/' % video_id, video_id, + headers=self.geo_verification_headers()) + + found_m3u8_urls = [] + + data = self._parse_json( + self._search_regex( + r'initialRoomDossier\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data', default='{}', group='value'), + video_id, transform_source=lowercase_escape, fatal=False) + if data: + m3u8_url = url_or_none(data.get('hls_source')) + if m3u8_url: + found_m3u8_urls.append(m3u8_url) + + if not found_m3u8_urls: + for m in re.finditer( + r'(\\u002[27])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(lowercase_escape(m.group('url'))) + + if not found_m3u8_urls: + for m in re.finditer( + r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage): + found_m3u8_urls.append(m.group('url')) + + m3u8_urls = [] + for found_m3u8_url in found_m3u8_urls: + m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '') + for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url): + if m3u8_url not in m3u8_urls: + m3u8_urls.append(m3u8_url) + + if not m3u8_urls: + error = self._search_regex( + 
[r']+class=(["\'])desc_span\1[^>]*>(?P[^<]+)', + r']+id=(["\'])defchat\1[^>]*>\s*

(?P[^<]+)<'], + webpage, 'error', group='error', default=None) + if not error: + if any(p in webpage for p in ( + self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): + error = self._ROOM_OFFLINE + if error: + raise ExtractorError(error, expected=True) + raise ExtractorError('Unable to find stream URL') + + formats = [] + for m3u8_url in m3u8_urls: + for known_id in ('fast', 'slow'): + if '_%s' % known_id in m3u8_url: + m3u8_id = known_id + break + else: + m3u8_id = None + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', + # ffmpeg skips segments for fast m3u8 + preference=-10 if m3u8_id == 'fast' else None, + m3u8_id=m3u8_id, fatal=False, live=True)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, + 'age_limit': self._rta_search(webpage), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/chilloutzone.py b/youtube_dlc/extractor/chilloutzone.py new file mode 100644 index 0000000..5aac212 --- /dev/null +++ b/youtube_dlc/extractor/chilloutzone.py @@ -0,0 +1,96 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import compat_b64decode +from ..utils import ( + clean_html, + ExtractorError +) + + +class ChilloutzoneIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P[\w|-]+)\.html' + _TESTS = [{ + 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', + 'md5': 'a76f3457e813ea0037e5244f509e66d1', + 'info_dict': { + 'id': 'enemene-meck-alle-katzen-weg', + 'ext': 'mp4', + 'title': 'Enemene Meck - Alle Katzen weg', + 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', + }, + }, { + 'note': 'Video hosted at YouTube', + 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', + 'info_dict': { + 'id': 
'1YVQaAgHyRU', + 'ext': 'mp4', + 'title': '16 Photos Taken 1 Second Before Disaster', + 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', + 'uploader': 'BuzzFeedVideo', + 'uploader_id': 'BuzzFeedVideo', + 'upload_date': '20131105', + }, + }, { + 'note': 'Video hosted at Vimeo', + 'url': 'http://www.chilloutzone.net/video/icon-blending.html', + 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', + 'info_dict': { + 'id': '85523671', + 'ext': 'mp4', + 'title': 'The Sunday Times - Icons', + 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', + 'uploader': 'Us', + 'uploader_id': 'usfilms', + 'upload_date': '20140131' + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + base64_video_info = self._html_search_regex( + r'var cozVidData = "(.+?)";', webpage, 'video data') + decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') + video_info_dict = json.loads(decoded_video_info) + + # get video information from dict + video_url = video_info_dict['mediaUrl'] + description = clean_html(video_info_dict.get('description')) + title = video_info_dict['title'] + native_platform = video_info_dict['nativePlatform'] + native_video_id = video_info_dict['nativeVideoId'] + source_priority = video_info_dict['sourcePriority'] + + # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) + if native_platform is None: + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) + + # Non Fallback: Decide to use native source (e.g. 
youtube or vimeo) or + # the own CDN + if source_priority == 'native': + if native_platform == 'youtube': + return self.url_result(native_video_id, ie='Youtube') + if native_platform == 'vimeo': + return self.url_result( + 'http://vimeo.com/' + native_video_id, ie='Vimeo') + + if not video_url: + raise ExtractorError('No video found') + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': title, + 'description': description, + } diff --git a/youtube_dlc/extractor/chirbit.py b/youtube_dlc/extractor/chirbit.py new file mode 100644 index 0000000..8d75cdf --- /dev/null +++ b/youtube_dlc/extractor/chirbit.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_b64decode +from ..utils import parse_duration + + +class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://chirb.it/be2abG', + 'info_dict': { + 'id': 'be2abG', + 'ext': 'mp3', + 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', + 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', + 'duration': 306, + 'uploader': 'Gerryaudio', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', + 'only_matching': True, + }, { + 'url': 'https://chirb.it/wp/MN58c2', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + data_fd = self._search_regex( + r'data-fd=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data fd', group='url') + + # Reverse engineered from https://chirb.it/js/chirbit.player.js (look + # for soundURL) + audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8') + + title = self._search_regex( + r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') + description 
= self._search_regex( + r'

Description

\s*]*>([^<]+)', + webpage, 'description', default=None) + duration = parse_duration(self._search_regex( + r'class=["\']c-length["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) + uploader = self._search_regex( + r'id=["\']chirbit-username["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) + + return { + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + } + + +class ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' + _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'info_dict': { + 'id': 'ScarletBeauty', + }, + 'playlist_mincount': 3, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + webpage = self._download_webpage(url, profile_id) + + entries = [ + self.url_result(self._proto_relative_url('//chirb.it/' + video_id)) + for _, video_id in re.findall(r']+id=([\'"])copy-btn-(?P[0-9a-zA-Z]+)\1', webpage)] + + return self.playlist_result(entries, profile_id) diff --git a/youtube_dlc/extractor/cinchcast.py b/youtube_dlc/extractor/cinchcast.py new file mode 100644 index 0000000..b861d54 --- /dev/null +++ b/youtube_dlc/extractor/cinchcast.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + xpath_text, +) + + +class CinchcastIE(InfoExtractor): + _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', + 'info_dict': { + 'id': '5258197', + 'ext': 'mp3', + 'title': 'Train Your Brain to Up Your Game with Coach Mandy', + 'upload_date': '20130816', + }, + }, { + # Actual test is run in generic, look for undergroundwellness + 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', + 'only_matching': True, + }] + + 
def _real_extract(self, url): + video_id = self._match_id(url) + doc = self._download_xml( + 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id, + video_id) + + item = doc.find('.//item') + title = xpath_text(item, './title', fatal=True) + date_str = xpath_text( + item, './{http://developer.longtailvideo.com/trac/}date') + upload_date = unified_strdate(date_str, day_first=False) + # duration is present but wrong + formats = [{ + 'format_id': 'main', + 'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'], + }] + backup_url = xpath_text( + item, './{http://developer.longtailvideo.com/trac/}backupContent') + if backup_url: + formats.append({ + 'preference': 2, # seems to be more reliable + 'format_id': 'backup', + 'url': backup_url, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/cinemax.py b/youtube_dlc/extractor/cinemax.py new file mode 100644 index 0000000..7f89d33 --- /dev/null +++ b/youtube_dlc/extractor/cinemax.py @@ -0,0 +1,29 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .hbo import HBOBaseIE + + +class CinemaxIE(HBOBaseIE): + _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P[^/]+/video/[0-9a-z-]+-(?P\d+))' + _TESTS = [{ + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903', + 'md5': '82e0734bba8aa7ef526c9dd00cf35a05', + 'info_dict': { + 'id': '20126903', + 'ext': 'mp4', + 'title': 'S1 Ep 1: Recap', + }, + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], + }, { + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id) + info['id'] = video_id + return info diff --git 
a/youtube_dlc/extractor/ciscolive.py b/youtube_dlc/extractor/ciscolive.py new file mode 100644 index 0000000..da404e4 --- /dev/null +++ b/youtube_dlc/extractor/ciscolive.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + try_get, + urlencode_postdata, +) + + +class CiscoLiveBaseIE(InfoExtractor): + # These appear to be constant across all Cisco Live presentations + # and are not tied to any user session or event + RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' + RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, + 'rfWidgetId': RAINFOCUS_WIDGET_ID, + } + + def _call_api(self, ep, rf_id, query, referrer, note=None): + headers = self.HEADERS.copy() + headers['Referer'] = referrer + return self._download_json( + self.RAINFOCUS_API_URL % ep, rf_id, note=note, + data=urlencode_postdata(query), headers=headers) + + def _parse_rf_item(self, rf_item): + event_name = rf_item.get('eventName') + title = rf_item['title'] + description = clean_html(rf_item.get('abstract')) + presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) + bc_id = rf_item['videos'][0]['url'] + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id + duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + location = try_get(rf_item, lambda x: x['times'][0]['room']) + + if duration: + duration = duration * 60 + + return { + '_type': 'url_transparent', + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + 'title': title, + 'description': description, 
+ 'duration': duration, + 'creator': presenter_name, + 'location': location, + 'series': event_name, + } + + +class CiscoLiveSessionIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P[^/?&]+)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'upload_date': '20180629', + 'uploader_id': '5647924234001', + 'location': '16B Mezz.', + }, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU', + 'only_matching': True, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS', + 'only_matching': True, + }] + + def _real_extract(self, url): + rf_id = self._match_id(url) + rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) + return self._parse_rf_item(rf_result['items'][0]) + + +class CiscoLiveSearchIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'info_dict': { + 'title': 'Search query', + }, + 'playlist_count': 5, + }, { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'only_matching': True, + }, { + 'url': 
'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + + @staticmethod + def _check_bc_id_exists(rf_item): + return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + + def _entries(self, query, url): + query['size'] = 50 + query['from'] = 0 + for page_num in itertools.count(1): + results = self._call_api( + 'search', None, query, url, + 'Downloading search JSON page %d' % page_num) + sl = try_get(results, lambda x: x['sectionList'][0], dict) + if sl: + results = sl + items = results.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + if not self._check_bc_id_exists(item): + continue + yield self._parse_rf_item(item) + size = int_or_none(results.get('size')) + if size is not None: + query['size'] = size + total = int_or_none(results.get('total')) + if total is not None and query['from'] + query['size'] > total: + break + query['from'] += query['size'] + + def _real_extract(self, url): + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query['type'] = 'session' + return self.playlist_result( + self._entries(query, url), playlist_title='Search query') diff --git a/youtube_dlc/extractor/cjsw.py b/youtube_dlc/extractor/cjsw.py new file mode 100644 index 0000000..505bdbe --- /dev/null +++ b/youtube_dlc/extractor/cjsw.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + unescapeHTML, +) + + +class CJSWIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?cjsw\.com/program/(?P[^/]+)/episode/(?P\d+)' + _TESTS = [{ + 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', + 'md5': 'cee14d40f1e9433632c56e3d14977120', + 'info_dict': { + 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', + 'ext': 'mp3', + 'title': 'Freshly Squeezed – Episode June 20, 2017', + 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', + 'series': 'Freshly Squeezed', + 'episode_id': '20170620', + }, + }, { + # no description + 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + program, episode_id = mobj.group('program', 'id') + audio_id = '%s/%s' % (program, episode_id) + + webpage = self._download_webpage(url, episode_id) + + title = unescapeHTML(self._search_regex( + (r']+class=["\']episode-header__title["\'][^>]*>(?P[^<]+)', + r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title')) + + audio_url = self._search_regex( + r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'audio url', group='url') + + audio_id = self._search_regex( + r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', + audio_url, 'audio id', default=audio_id) + + formats = [{ + 'url': audio_url, + 'ext': determine_ext(audio_url, 'mp3'), + 'vcodec': 'none', + }] + + description = self._html_search_regex( + r'<p>(?P<description>.+?)</p>', webpage, 'description', + default=None) + series = self._search_regex( + r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, + 'series', default=program, group='name') + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'formats': formats, + 'series': series, + 'episode_id': episode_id, + } diff --git a/youtube_dlc/extractor/cliphunter.py b/youtube_dlc/extractor/cliphunter.py new file mode 100644 index 0000000..f2ca7a3 --- /dev/null +++ b/youtube_dlc/extractor/cliphunter.py @@ -0,0 +1,79 @@ 
+from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + url_or_none, +) + + +class CliphunterIE(InfoExtractor): + IE_NAME = 'cliphunter' + + _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ + (?P<id>[0-9]+)/ + (?P<seo>.+?)(?:$|[#\?]) + ''' + _TESTS = [{ + 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', + 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', + 'info_dict': { + 'id': '1012420', + 'ext': 'flv', + 'title': 'Fun Jynx Maze solo', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz', + 'md5': '55a723c67bfc6da6b0cfa00d55da8a27', + 'info_dict': { + 'id': '2019449', + 'ext': 'mp4', + 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._search_regex( + r'mediaTitle = "([^"]+)"', webpage, 'title') + + gexo_files = self._parse_json( + self._search_regex( + r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), + video_id) + + formats = [] + for format_id, f in gexo_files.items(): + video_url = url_or_none(f.get('url')) + if not video_url: + continue + fmt = f.get('fmt') + height = f.get('h') + format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'width': int_or_none(f.get('w')), + 'height': int_or_none(height), + 'tbr': int_or_none(f.get('br')), + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r"var\s+mov_thumb\s*=\s*'([^']+)';", + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'age_limit': 
self._rta_search(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dlc/extractor/clippit.py b/youtube_dlc/extractor/clippit.py new file mode 100644 index 0000000..a1a7a77 --- /dev/null +++ b/youtube_dlc/extractor/clippit.py @@ -0,0 +1,74 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + qualities, +) + +import re + + +class ClippitIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)' + _TEST = { + 'url': 'https://www.clippituser.tv/c/evmgm', + 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09', + 'info_dict': { + 'id': 'evmgm', + 'ext': 'mp4', + 'title': 'Bye bye Brutus. #BattleBots - Clippit', + 'uploader': 'lizllove', + 'uploader_url': 'https://www.clippituser.tv/p/lizllove', + 'timestamp': 1472183818, + 'upload_date': '20160826', + 'description': 'BattleBots | ABC', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title.*>(.+?)', webpage, 'title') + + FORMATS = ('sd', 'hd') + quality = qualities(FORMATS) + formats = [] + for format_id in FORMATS: + url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, + webpage, 'url', fatal=False) + if not url: + continue + match = re.search(r'/(?P\d+)\.mp4', url) + formats.append({ + 'url': url, + 'format_id': format_id, + 'quality': quality(format_id), + 'height': int(match.group('height')) if match else None, + }) + + uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n', + webpage, 'uploader', fatal=False) + uploader_url = ('https://www.clippituser.tv/p/' + uploader + if uploader else None) + + timestamp = self._html_search_regex(r'datetime="(.+?)"', + webpage, 'date', fatal=False) + thumbnail = self._html_search_regex(r'data-image="(.+?)"', + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 
'title': title, + 'formats': formats, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'timestamp': parse_iso8601(timestamp), + 'description': self._og_search_description(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dlc/extractor/cliprs.py b/youtube_dlc/extractor/cliprs.py new file mode 100644 index 0000000..d55b26d --- /dev/null +++ b/youtube_dlc/extractor/cliprs.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .onet import OnetBaseIE + + +class ClipRsIE(OnetBaseIE): + _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' + _TEST = { + 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', + 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', + 'info_dict': { + 'id': '1488842.1399140381', + 'ext': 'mp4', + 'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli', + 'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', + 'duration': 229, + 'timestamp': 1459850243, + 'upload_date': '20160405', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + mvp_id = self._search_mvp_id(webpage) + + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict['display_id'] = display_id + + return info_dict diff --git a/youtube_dlc/extractor/clipsyndicate.py b/youtube_dlc/extractor/clipsyndicate.py new file mode 100644 index 0000000..6cdb42f --- /dev/null +++ b/youtube_dlc/extractor/clipsyndicate.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + find_xpath_attr, + fix_xml_ampersands +) + + +class ClipsyndicateIE(InfoExtractor): + _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + + _TESTS = [{ + 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + 'md5': '4d7d549451bad625e0ff3d7bd56d776c', + 'info_dict': { + 'id': '4629301', + 'ext': 'mp4', + 'title': 
'Brick Briscoe', + 'duration': 612, + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, { + 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + js_player = self._download_webpage( + 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, + video_id, 'Downlaoding player') + # it includes a required token + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') + + pdoc = self._download_xml( + 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, + video_id, 'Downloading video info', + transform_source=fix_xml_ampersands) + + track_doc = pdoc.find('trackList/track') + + def find_param(name): + node = find_xpath_attr(track_doc, './/param', 'name', name) + if node is not None: + return node.attrib['value'] + + return { + 'id': video_id, + 'title': find_param('title'), + 'url': track_doc.find('location').text, + 'thumbnail': find_param('thumbnail'), + 'duration': int(find_param('duration')), + } diff --git a/youtube_dlc/extractor/closertotruth.py b/youtube_dlc/extractor/closertotruth.py new file mode 100644 index 0000000..26243d5 --- /dev/null +++ b/youtube_dlc/extractor/closertotruth.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CloserToTruthIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'info_dict': { + 'id': '0_zof1ktre', + 'display_id': 'solutions-the-mind-body-problem', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'info_dict': { + 'id': 
'0_iuxai6g6', + 'display_id': 'how-do-brains-work', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/interviews/1725', + 'info_dict': { + 'id': '1725', + 'title': 'AyaFr-002', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + partner_id = self._search_regex( + r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', + webpage, 'kaltura partner_id') + + title = self._search_regex( + r'(.+?)\s*\|\s*.+?', webpage, 'video title') + + select = self._search_regex( + r'(?s)]+id="select-version"[^>]*>(.+?)', + webpage, 'select version', default=None) + if select: + entry_ids = set() + entries = [] + for mobj in re.finditer( + r']+value=(["\'])(?P[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P[^<]+)', + webpage): + entry_id = mobj.group('id') + if entry_id in entry_ids: + continue + entry_ids.add(entry_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': mobj.group('title'), + }) + if entries: + return self.playlist_result(entries, display_id, title) + + entry_id = self._search_regex( + r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', + webpage, 'kaltura entry_id', group='id') + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': title + } diff --git a/youtube_dlc/extractor/cloudflarestream.py b/youtube_dlc/extractor/cloudflarestream.py new file mode 100644 index 0000000..2fdcfbb --- /dev/null +++ b/youtube_dlc/extractor/cloudflarestream.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import re + +from .common import InfoExtractor + + +class 
CloudflareStreamIE(InfoExtractor): + _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' + _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE + _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:watch\.)?%s/| + %s + ) + (?P<id>%s) + ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) + _TESTS = [{ + 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', + 'info_dict': { + 'id': '31c9291ab41fac05471db4e73aa11717', + 'ext': 'mp4', + 'title': '31c9291ab41fac05471db4e73aa11717', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', + 'only_matching': True, + }, { + 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', + 'only_matching': True, + }, { + 'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE), + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' + base_url = 'https://%s/%s/' % (domain, video_id) + if '.' in video_id: + video_id = self._parse_json(base64.urlsafe_b64decode( + video_id.split('.')[1]), video_id)['sub'] + manifest_base_url = base_url + 'manifest/video.' 
+ + formats = self._extract_m3u8_formats( + manifest_base_url + 'm3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': base_url + 'thumbnails/thumbnail.jpg', + 'formats': formats, + } diff --git a/youtube_dlc/extractor/cloudy.py b/youtube_dlc/extractor/cloudy.py new file mode 100644 index 0000000..85ca20e --- /dev/null +++ b/youtube_dlc/extractor/cloudy.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + str_to_int, + unified_strdate, +) + + +class CloudyIE(InfoExtractor): + _IE_DESC = 'cloudy.ec' + _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ + 'url': 'https://www.cloudy.ec/v/af511e2527aac', + 'md5': '29832b05028ead1b58be86bf319397ca', + 'info_dict': { + 'id': 'af511e2527aac', + 'ext': 'mp4', + 'title': 'Funny Cats and Animals Compilation june 2013', + 'upload_date': '20130913', + 'view_count': int, + } + }, { + 'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.cloudy.ec/embed.php', video_id, query={ + 'id': video_id, + 'playerPage': 1, + 'autoplay': 1, + }) + + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + webpage = self._download_webpage( + 'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False) + + if webpage: + info.update({ + 'title': self._search_regex( + r'<h\d[^>]*>([^<]+)<', webpage, 'title'), + 'upload_date': unified_strdate(self._search_regex( + r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage, + 'upload date', fatal=False)), + 'view_count': str_to_int(self._search_regex( + r'([\d,.]+) 
views<', webpage, 'view count', fatal=False)), + }) + + if not info.get('title'): + info['title'] = video_id + + info['id'] = video_id + + return info diff --git a/youtube_dlc/extractor/clubic.py b/youtube_dlc/extractor/clubic.py new file mode 100644 index 0000000..98f9cb5 --- /dev/null +++ b/youtube_dlc/extractor/clubic.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + qualities, +) + + +class ClubicIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', + 'md5': '1592b694ba586036efac1776b0b43cd3', + 'info_dict': { + 'id': '448474', + 'ext': 'mp4', + 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité', + 'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*', + 'thumbnail': r're:^http://img\.clubic\.com/.*\.jpg$', + } + }, { + 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id + player_page = self._download_webpage(player_url, video_id) + + config = self._parse_json(self._search_regex( + r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, + 'configuration'), video_id) + + video_info = config['videoInfo'] + sources = config['sources'] + quality_order = qualities(['sd', 'hq']) + + formats = [{ + 'format_id': src['streamQuality'], + 'url': src['src'], + 'quality': quality_order(src['streamQuality']), + } for src in sources] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'description': 
clean_html(video_info.get('description')), + 'thumbnail': config.get('poster'), + } diff --git a/youtube_dlc/extractor/clyp.py b/youtube_dlc/extractor/clyp.py new file mode 100644 index 0000000..06d04de --- /dev/null +++ b/youtube_dlc/extractor/clyp.py @@ -0,0 +1,82 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + unified_timestamp, +) + + +class ClypIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'https://clyp.it/ojz2wfah', + 'md5': '1d4961036c41247ecfdcc439c0cddcbb', + 'info_dict': { + 'id': 'ojz2wfah', + 'ext': 'mp3', + 'title': 'Krisson80 - bits wip wip', + 'description': '#Krisson80BitsWipWip #chiptune\n#wip', + 'duration': 263.21, + 'timestamp': 1443515251, + 'upload_date': '20150929', + }, + }, { + 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', + 'info_dict': { + 'id': 'b04p1odi', + 'ext': 'mp3', + 'title': 'GJ! 
(Reward Edit)', + 'description': 'Metal Resistance (THE ONE edition)', + 'duration': 177.789, + 'timestamp': 1528241278, + 'upload_date': '20180605', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + token = qs.get('token', [None])[0] + + query = {} + if token: + query['token'] = token + + metadata = self._download_json( + 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) + + formats = [] + for secure in ('', 'Secure'): + for ext in ('Ogg', 'Mp3'): + format_id = '%s%s' % (secure, ext) + format_url = metadata.get('%sUrl' % format_id) + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = metadata['Title'] + description = metadata.get('Description') + duration = float_or_none(metadata.get('Duration')) + timestamp = unified_timestamp(metadata.get('DateCreated')) + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/cmt.py b/youtube_dlc/extractor/cmt.py new file mode 100644 index 0000000..a4ddb91 --- /dev/null +++ b/youtube_dlc/extractor/cmt.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +from .mtv import MTVIE + +# TODO Remove - Reason: Outdated Site + + +class CMTIE(MTVIE): + IE_NAME = 'cmt.com' + _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', + 'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2', + 'info_dict': { + 'id': '989124', + 'ext': 'mp4', + 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', + 'description': 'Blame It All On My Roots', + }, + 'skip': 'Video not available', + }, { + 
'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 'mp4', + 'title': 'Still The King Ep. 109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, + }, { + 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes', + 'only_matching': True, + }] + + def _extract_mgid(self, webpage, url): + mgid = self._search_regex( + r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1', + webpage, 'mgid', group='mgid', default=None) + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + return mgid + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage, url) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dlc/extractor/cnbc.py b/youtube_dlc/extractor/cnbc.py new file mode 100644 index 0000000..6889b0f --- /dev/null +++ b/youtube_dlc/extractor/cnbc.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class CNBCIE(InfoExtractor): + _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://video.cnbc.com/gallery/?video=3000503714', + 'info_dict': { + 'id': '3000503714', + 'ext': 'mp4', + 'title': 'Fighting zombies is big business', + 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + 'timestamp': 1459332000, + 
'upload_date': '20160330', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, + {'force_smil_url': True}), + 'id': video_id, + } + + +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _TEST = { + 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'info_dict': { + 'id': '7000031301', + 'ext': 'mp4', + 'title': "Trump: I don't necessarily agree with raising rates", + 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', + 'timestamp': 1531958400, + 'upload_date': '20180719', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, + 'video id') + return self.url_result( + 'http://video.cnbc.com/gallery/?video=%s' % video_id, + CNBCIE.ie_key()) diff --git a/youtube_dlc/extractor/cnn.py b/youtube_dlc/extractor/cnn.py new file mode 100644 index 0000000..774b710 --- /dev/null +++ b/youtube_dlc/extractor/cnn.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .turner import TurnerBaseIE +from ..utils import url_basename + + +class CNNIE(TurnerBaseIE): + _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' + + _TESTS = [{ + 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', + 'md5': 
'3e6121ea48df7e2259fe73a0628605c4', + 'info_dict': { + 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', + 'ext': 'mp4', + 'title': 'Nadal wins 8th French Open title', + 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', + 'duration': 135, + 'upload_date': '20130609', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', + 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', + 'info_dict': { + 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'ext': 'mp4', + 'title': "Student's epic speech stuns new freshmen", + 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", + 'upload_date': '20130821', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', + 'md5': 'f14d02ebd264df951feb2400e2c25a1b', + 'info_dict': { + 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'ext': 'mp4', + 'title': 'Nashville Ep. 1: Hand crafted skateboards', + 'description': 'md5:e7223a503315c9f150acac52e76de086', + 'upload_date': '20141222', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', + 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'info_dict': { + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'ext': 'mp4', + 'title': '5 stunning stats about Netflix', + 'description': 'Did you know that Netflix has more than 80 million members? 
Here are five facts about the online video distributor that you probably didn\'t know.', + 'upload_date': '20160819', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', + 'only_matching': True, + }, { + 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', + 'only_matching': True, + }, { + 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', + 'only_matching': True, + }] + + _CONFIG = { + # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml + 'edition': { + 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', + 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + }, + # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml + 'money': { + 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', + 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, + } + + def _extract_timestamp(self, video_data): + # TODO: fix timestamp extraction + return None + + def _real_extract(self, url): + sub_domain, path, page_title = re.match(self._VALID_URL, url).groups() + if sub_domain not in ('money', 'edition'): + sub_domain = 'edition' + config = self._CONFIG[sub_domain] + return self._extract_cvp_info( + config['data_src'] % path, page_title, { + 'default': { + 'media_src': config['media_src'], + } + }) + + +class CNNBlogsIE(InfoExtractor): + _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' + _TEST = { + 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', + 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', + 'info_dict': { + 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', + 'ext': 'mp4', + 'title': 'Criminalizing journalism?', + 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', + 
'upload_date': '20140209', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'add_ie': ['CNN'], + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, url_basename(url)) + cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') + return self.url_result(cnn_url, CNNIE.ie_key()) + + +class CNNArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' + _TEST = { + 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', + 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', + 'info_dict': { + 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', + 'ext': 'mp4', + 'title': 'Obama: Cyberattack not an act of war', + 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', + 'upload_date': '20141221', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'add_ie': ['CNN'], + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, url_basename(url)) + cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') + return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) diff --git a/youtube_dlc/extractor/comedycentral.py b/youtube_dlc/extractor/comedycentral.py new file mode 100644 index 0000000..f54c4ad --- /dev/null +++ b/youtube_dlc/extractor/comedycentral.py @@ -0,0 +1,142 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor +from .common import InfoExtractor + + +class ComedyCentralIE(MTVServicesInfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ + (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) + /(?P<title>.*)''' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'info_dict': { + 'id': 
'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'ext': 'mp4', + 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', + 'description': 'After a certain point, breastfeeding becomes c**kblocking.', + 'timestamp': 1376798400, + 'upload_date': '20130818', + }, + }, { + 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'only_matching': True, + }] + + +class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ + (?:full-episodes|shows(?=/[^/]+/full-episodes)) + /(?P<id>[^?]+)''' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' + + _TESTS = [{ + 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', + 'info_dict': { + 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', + 'title': 'November 28, 2016 - Ryan Speedo Green', + }, + 'playlist_count': 4, + }, { + 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + mgid = self._extract_mgid(webpage, url, data_zone='t2_lc_promo1') + videos_info = self._get_videos_info(mgid) + return videos_info + + +class ToshIE(MTVServicesInfoExtractor): + IE_DESC = 'Tosh.0' + _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' + _FEED_URL = 'http://tosh.cc.com/feeds/mrss' + + _TESTS = [{ + 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'info_dict': { + 'description': 'Tosh asked fans to share their summer plans.', + 'title': 'Twitter Users Share Summer Plans', + }, + 'playlist': [{ + 'md5': 
'f269e88114c1805bb6d7653fecea9e06', + 'info_dict': { + 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', + 'description': 'Tosh asked fans to share their summer plans.', + 'thumbnail': r're:^https?://.*\.jpg', + # It's really reported to be published on year 2077 + 'upload_date': '20770610', + 'timestamp': 3390510600, + 'subtitles': { + 'en': 'mincount:3', + }, + }, + }] + }, { + 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'only_matching': True, + }] + + +class ComedyCentralTVIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'info_dict': { + 'id': 'local_playlist-f99b626bdfe13568579a', + 'ext': 'flv', + 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', + 'only_matching': True, + }, { + 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + mrss_url = self._search_regex( + r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'mrss url', group='url') + + return self._get_videos_info_from_url(mrss_url, video_id) + + +class ComedyCentralShortnameIE(InfoExtractor): + _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' + _TESTS = [{ + 'url': ':tds', + 'only_matching': True, + }, { + 'url': ':thedailyshow', + 'only_matching': True, + }, { + 'url': ':theopposition', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + shortcut_map = { + 'tds': 
'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + } + return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py new file mode 100644 index 0000000..4b42d69 --- /dev/null +++ b/youtube_dlc/extractor/common.py @@ -0,0 +1,3023 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import datetime +import hashlib +import json +import netrc +import os +import random +import re +import socket +import ssl +import sys +import time +import math + +from ..compat import ( + compat_cookiejar_Cookie, + compat_cookies, + compat_etree_Element, + compat_etree_fromstring, + compat_getpass, + compat_integer_types, + compat_http_client, + compat_os_name, + compat_str, + compat_urllib_error, + compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, + compat_urllib_request, + compat_urlparse, + compat_xml_parse_error, +) +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, +) +from ..utils import ( + NO_DEFAULT, + age_restricted, + base_url, + bug_reports_message, + clean_html, + compiled_regex_type, + determine_ext, + determine_protocol, + dict_get, + error_to_compat_str, + ExtractorError, + extract_attributes, + fix_xml_ampersands, + float_or_none, + GeoRestrictedError, + GeoUtils, + int_or_none, + js_to_json, + JSON_LD_RE, + mimetype2ext, + orderedSet, + parse_bitrate, + parse_codecs, + parse_duration, + parse_iso8601, + parse_m3u8_attributes, + parse_resolution, + RegexNotFoundError, + sanitized_Request, + sanitize_filename, + str_or_none, + str_to_int, + strip_or_none, + unescapeHTML, + unified_strdate, + unified_timestamp, + update_Request, + update_url_query, + urljoin, + url_basename, + url_or_none, + xpath_element, + xpath_text, + xpath_with_ns, +) + + 
+class InfoExtractor(object): + """Information Extractor class. + + Information extractors are the classes that, given a URL, extract + information about the video (or videos) the URL refers to. This + information includes the real video URL, the video title, author and + others. The information is stored in a dictionary which is then + passed to the YoutubeDL. The YoutubeDL processes this + information possibly downloading the video to the file system, among + other possible outcomes. + + The type field determines the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: + + id: Video identifier. + title: Video title, unescaped. + + Additionally, it must contain either a formats entry or a url one: + + formats: A list of dictionaries for each format available, ordered + from worst to best quality. + + Potential fields: + * url The mandatory URL representing the media: + for plain file media - HTTP URL of this file, + for RTMP - RTMP URL, + for HLS - URL of the M3U8 media playlist, + for HDS - URL of the F4M manifest, + for DASH + - HTTP URL to plain file media (in case of + unfragmented media) + - URL of the MPD manifest or base URL + representing the media if MPD manifest + is parsed from a string (in case of + fragmented media) + for MSS - URL of the ISM manifest. + * manifest_url + The URL of the manifest file in case of + fragmented media: + for HLS - URL of the M3U8 master playlist, + for HDS - URL of the F4M manifest, + for DASH - URL of the MPD manifest, + for MSS - URL of the ISM manifest. + * ext Will be calculated from URL if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from the format_id, width, height. + and format_note fields if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19"). 
+ Technically optional, but strongly recommended. + * format_note Additional info about the format + ("3D" or "DASH video") + * width Width of the video, if known + * height Height of the video, if known + * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz + * vbr Average video bitrate in KBit/s + * fps Frame rate + * vcodec Name of the video codec in use + * container Name of the container format + * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes + * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp", "rtmpe", + "m3u8", "m3u8_native" or "http_dash_segments". + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url + * "duration" (optional, int or float) + * "filesize" (optional, int) + * preference Order number of this format. If this field is + present and not None, the formats get sorted + by this field, regardless of all other values. + -1 for default (order by other properties), + -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) + * language Language code, e.g. "de" or "en-US". + * language_preference Is this in the language mentioned in + the URL? 
+ 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. + * quality Order number of the video quality of this + format, irrespective of the file format. + -1 for default (order by other properties), + -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * stretched_ratio If given and not 1, indicates that the + video's pixels are not square. + width : height ratio as float. + * no_resume The server does not support resuming the + (HTTP or RTMP) download. Boolean. + * downloader_options A dictionary of downloader options as + described in FileDownloader + + url: Final video URL. + ext: Video filename extension. + format: The video format, defaults to ext (used for --get-format) + player_url: SWF Player URL (used for rtmpdump). + + The following fields are optional: + + alt_title: A secondary title of the video. + display_id: An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" + thumbnails: A list of dictionaries, with the following entries: + * "id" (optional, string) - Thumbnail format ID + * "url" + * "preference" (optional, int) - quality of the image + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height}", + deprecated) + * "filesize" (optional, int) + thumbnail: Full URL to a video thumbnail image. + description: Full video description. + uploader: Full name of the video uploader. + license: License name the video is licensed under. + creator: The creator of the video. + release_date: The date (YYYYMMDD) when the video was released.
+ timestamp: UNIX timestamp of the moment the video became available. + upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. + uploader_id: Nickname or id of the video uploader. + uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may not repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. + location: Physical location where the video was filmed. + subtitles: The available subtitles as a dictionary in the format + {tag: subformats}. "tag" is usually a language code, and + "subformats" is a list sorted from lower to higher + preference, each element is a dictionary with the "ext" + entry and one of: + * "data": The subtitles file contents + * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions + duration: Length of the video in seconds, as an integer or float. + view_count: How many users have watched the video on the platform. + like_count: Number of positive ratings of the video + dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video + average_rating: Average rating given by users, the scale used depends on the webpage + comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to.
+ Set to "root" to indicate that this is a + comment to the original video. + age_limit: Age restriction for the video, as an integer (years) + webpage_url: The URL to the video webpage, if given to youtube-dlc it + should allow to get the same result again. (It will be set + by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] + is_live: True, False, or None (=unknown). Whether this video is a + live stream that goes on instead of a fixed-length video. + start_time: Time in seconds where the reproduction should start, as + specified in the URL. + end_time: Time in seconds where the reproduction should end, as + specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) + + The following fields should only be used when the video belongs to some logical + chapter or section: + + chapter: Name or title of the chapter the video belongs to. + chapter_number: Number of the chapter the video belongs to, as an integer. + chapter_id: Id of the chapter the video belongs to, as a unicode string. + + The following fields should only be used when the video is an episode of some + series, programme or podcast: + + series: Title of the series or programme the video episode belongs to. + season: Title of the season the video episode belongs to. + season_number: Number of the season the video episode belongs to, as an integer. + season_id: Id of the season the video episode belongs to, as a unicode string. + episode: Title of the video episode. Unlike mandatory video title field, + this field should denote the exact title of the video episode + without any kind of decoration. + episode_number: Number of the video episode within a season, as an integer. 
+ episode_id: Id of the video episode, as a unicode string. + + The following fields should only be used when the media is a track or a part of + a music album: + + track: Title of the track. + track_number: Number of the track within an album or a disc, as an integer. + track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), + as a unicode string. + artist: Artist(s) of the track. + genre: Genre(s) of the track. + album: Title of the album the track belongs to. + album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). + album_artist: List of all artists that appeared on the album (e.g. + "Ash Borer / Fell Voices" or "Various Artists", useful for splits + and compilations). + disc_number: Number of the disc or other physical medium the track belongs to, + as an integer. + release_year: Year (YYYY) when the album was released. + + Unless mentioned otherwise, the fields should be Unicode strings. + + Unless mentioned otherwise, None is equivalent to absence of information. + + + _type "playlist" indicates multiple videos. + There must be a key "entries", which is a list, an iterable, or a PagedList + object, each element of which is a valid dictionary by this specification. + + Additionally, playlists can have "id", "title", "description", "uploader", + "uploader_id", "uploader_url" attributes with the same semantics as videos + (see above). + + + _type "multi_video" indicates that there are multiple videos that + form a single show, for example multiple acts of an opera or TV episode. + It must have an entries key like a playlist and contain all the keys + required for a video at the same time. + + + _type "url" indicates that the video must be extracted from another + location, possibly by a different extractor. Its only required key is: + "url" - the next URL to extract. + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance.
+ Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + + Subclasses of this one should re-define the _real_initialize() and + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. + + _GEO_BYPASS attribute may be set to False in order to disable + geo restriction bypass mechanisms for a particular extractor. + Though it won't disable explicit geo restriction bypass based on + country code provided with geo_bypass_country. + + _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted + countries for this extractor. One of these countries will be used by + geo restriction bypass mechanism right away in order to bypass + geo restriction, of course, if the mechanism is not disabled. + + _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted + IP blocks in CIDR notation for this extractor. One of these IP blocks + will be used by geo restriction bypass mechanism similarly + to _GEO_COUNTRIES. + + Finally, the _WORKING attribute should be set to False for broken IEs + in order to warn the users and skip the tests. + """ + + _ready = False + _downloader = None + _x_forwarded_for_ip = None + _GEO_BYPASS = True + _GEO_COUNTRIES = None + _GEO_IP_BLOCKS = None + _WORKING = True + + def __init__(self, downloader=None): + """Constructor. 
Receives an optional downloader.""" + self._ready = False + self._x_forwarded_for_ip = None + self.set_downloader(downloader) + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None + + @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return compat_str(m.group('id')) + + @classmethod + def working(cls): + """Getter method for _WORKING.""" + return cls._WORKING + + def initialize(self): + """Initializes an instance (authentication, etc).""" + self._initialize_geo_bypass({ + 'countries': self._GEO_COUNTRIES, + 'ip_blocks': self._GEO_IP_BLOCKS, + }) + if not self._ready: + self._real_initialize() + self._ready = True + + def _initialize_geo_bypass(self, geo_bypass_context): + """ + Initialize geo restriction bypass mechanism. + + This method is used to initialize geo bypass mechanism based on faking + X-Forwarded-For HTTP header. A random country from provided country list + is selected and a random IP belonging to this country is generated. This + IP will be passed as X-Forwarded-For HTTP header in all subsequent + HTTP requests. + + This method will be used for initial geo bypass mechanism initialization + during the instance initialization with _GEO_COUNTRIES and + _GEO_IP_BLOCKS. + + You may also manually call it from extractor's code if geo bypass + information is not available beforehand (e.g. obtained during + extraction) or due to some other reason. In this case you should pass + this information in geo bypass context passed as first argument. 
It may + contain following fields: + + countries: List of geo unrestricted countries (similar + to _GEO_COUNTRIES) + ip_blocks: List of geo unrestricted IP blocks in CIDR notation + (similar to _GEO_IP_BLOCKS) + + """ + if not self._x_forwarded_for_ip: + + # Geo bypass mechanism is explicitly disabled by user + if not self._downloader.params.get('geo_bypass', True): + return + + if not geo_bypass_context: + geo_bypass_context = {} + + # Backward compatibility: previously _initialize_geo_bypass + # expected a list of countries, some 3rd party code may still use + # it this way + if isinstance(geo_bypass_context, (list, tuple)): + geo_bypass_context = { + 'countries': geo_bypass_context, + } + + # The whole point of geo bypass mechanism is to fake IP + # as X-Forwarded-For HTTP header based on some IP block or + # country code. + + # Path 1: bypassing based on IP block in CIDR notation + + # Explicit IP block specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + ip_block = self._downloader.params.get('geo_bypass_ip_block', None) + + # Otherwise use random IP block from geo bypass context but only + # if extractor is known as geo bypassable + if not ip_block: + ip_blocks = geo_bypass_context.get('ip_blocks') + if self._GEO_BYPASS and ip_blocks: + ip_block = random.choice(ip_blocks) + + if ip_block: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s as X-Forwarded-For.' 
+ % self._x_forwarded_for_ip) + return + + # Path 2: bypassing based on country code + + # Explicit country code specified by user, use it right away + # regardless of whether extractor is geo bypassable or not + country = self._downloader.params.get('geo_bypass_country', None) + + # Otherwise use random country code from geo bypass context but + # only if extractor is known as geo bypassable + if not country: + countries = geo_bypass_context.get('countries') + if self._GEO_BYPASS and countries: + country = random.choice(countries) + + if country: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country.upper())) + + def extract(self, url): + """Extracts URL information and returns it in list of dicts.""" + try: + for _ in range(2): + try: + self.initialize() + ie_result = self._real_extract(url) + if self._x_forwarded_for_ip: + ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip + return ie_result + except GeoRestrictedError as e: + if self.__maybe_fake_ip_and_retry(e.countries): + continue + raise + except ExtractorError: + raise + except compat_http_client.IncompleteRead as e: + raise ExtractorError('A network error has occurred.', cause=e, expected=True) + except (KeyError, StopIteration) as e: + raise ExtractorError('An extractor error has occurred.', cause=e) + + def __maybe_fake_ip_and_retry(self, countries): + if (not self._downloader.params.get('geo_bypass_country', None) + and self._GEO_BYPASS + and self._downloader.params.get('geo_bypass', True) + and not self._x_forwarded_for_ip + and countries): + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' 
+ % (self._x_forwarded_for_ip, country_code.upper())) + return True + return False + + def set_downloader(self, downloader): + """Sets the downloader for this IE.""" + self._downloader = downloader + + def _real_initialize(self): + """Real initialization process. Redefine in subclasses.""" + pass + + def _real_extract(self, url): + """Real extraction process. Redefine in subclasses.""" + pass + + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return compat_str(cls.__name__[:-2]) + + @property + def IE_NAME(self): + return compat_str(type(self).__name__[:-2]) + + @staticmethod + def __can_accept_status_code(err, expected_status): + assert isinstance(err, compat_urllib_error.HTTPError) + if expected_status is None: + return False + if isinstance(expected_status, compat_integer_types): + return err.code == expected_status + elif isinstance(expected_status, (list, tuple)): + return err.code in expected_status + elif callable(expected_status): + return expected_status(err.code) is True + else: + assert False + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + """ + Return the response handle. + + See _download_webpage docstring for arguments specification. + """ + if note is None: + self.report_download_webpage(video_id) + elif note is not False: + if video_id is None: + self.to_screen('%s' % (note,)) + else: + self.to_screen('%s: %s' % (video_id, note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. 
+ if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: + if query: + url_or_request = update_url_query(url_or_request, query) + if data is not None or headers: + url_or_request = sanitized_Request(url_or_request, data, headers) + exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] + if hasattr(ssl, 'CertificateError'): + exceptions.append(ssl.CertificateError) + try: + return self._downloader.urlopen(url_or_request) + except tuple(exceptions) as err: + if isinstance(err, compat_urllib_error.HTTPError): + if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of <https://bugs.python.org/issue15002> + # introduced in Python 3.4.1. + err.fp._error = err + return err.fp + + if errnote is False: + return False + if errnote is None: + errnote = 'Unable to download webpage' + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) + if fatal: + raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + else: + self._downloader.report_warning(errmsg) + return False + + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + """ + Return a tuple (page content as string, URL handle). + + See _download_webpage docstring for arguments specification. 
+ """ + # Strip hashes from the URL (#1038) + if isinstance(url_or_request, (compat_str, str)): + url_or_request = url_or_request.partition('#')[0] + + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) + if urlh is False: + assert not fatal + return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) + return (content, urlh) + + @staticmethod + def _guess_encoding_from_content(content_type, webpage_bytes): + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' + else: + encoding = 'utf-8' + + return encoding + + def __check_blocked(self, content): + first_block = content[:512] + if ('<title>Access to this site is blocked' in content + and 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 
+ blocked_iframe = self._html_search_regex( + r'' + manifest_url = self._html_search_regex( + PLAYER_REGEX, webpage, 'manifest_url') + + partner_id = self._search_regex( + r'/p(?:artner_id)?/(\d+)', manifest_url, 'partner id', + default='1670711') + + kaltura_id = self._search_regex( + r'entry_id=(?P(?:[^&])+)', manifest_url, + 'kaltura id', group='id') + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), + 'ie_key': KalturaIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + } diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py new file mode 100644 index 0000000..aba06b3 --- /dev/null +++ b/youtube_dlc/extractor/generic.py @@ -0,0 +1,3459 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import os +import re +import sys + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..compat import ( + compat_etree_fromstring, + compat_str, + compat_urllib_parse_unquote, + compat_urlparse, + compat_xml_parse_error, +) +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + HEADRequest, + is_html, + js_to_json, + KNOWN_EXTENSIONS, + merge_dicts, + mimetype2ext, + orderedSet, + sanitized_Request, + smuggle_url, + unescapeHTML, + unified_strdate, + unsmuggle_url, + UnsupportedError, + xpath_text, +) +from .commonprotocols import RtmpIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nbc import NBCSportsVPlayerIE +from .ooyala import OoyalaIE +from .rutv import RUTVIE +from .tvc import TVCIE +from .sportbox import SportBoxIE +from .smotri import SmotriIE +from .myvi import MyviIE +from .condenast import CondeNastIE +from .udn import UDNEmbedIE +from .senateisvp import SenateISVPIE +from .svt import SVTIE +from .pornhub import PornHubIE +from .xhamster import XHamsterEmbedIE +from .tnaflix import TNAFlixNetworkEmbedIE +from .drtuber import 
DrTuberIE +from .redtube import RedTubeIE +from .tube8 import Tube8IE +from .mofosex import MofosexEmbedIE +from .spankwire import SpankwireIE +from .youporn import YouPornIE +from .vimeo import VimeoIE +from .dailymotion import DailymotionIE +from .dailymail import DailyMailIE +from .onionstudios import OnionStudiosIE +from .viewlift import ViewLiftEmbedIE +from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE +from .videomore import VideomoreIE +from .webcaster import WebcasterFeedIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE +from .digiteka import DigitekaIE +from .arkena import ArkenaIE +from .instagram import InstagramIE +from .liveleak import LiveLeakIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE +from .soundcloud import SoundcloudEmbedIE +from .tunein import TuneInBaseIE +from .vbox7 import Vbox7IE +from .dbtv import DBTVIE +from .piksel import PikselIE +from .videa import VideaIE +from .twentymin import TwentyMinutenIE +from .ustream import UstreamIE +from .videopress import VideoPressIE +from .rutube import RutubeIE +from .limelight import LimelightBaseIE +from .anvato import AnvatoIE +from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE +from .joj import JojIE +from .megaphone import MegaphoneIE +from .vzaar import VzaarIE +from .channel9 import Channel9IE +from .vshare import VShareIE +from .mediasite import MediasiteIE +from .springboardplatform import SpringboardPlatformIE +from .yapfiles import YapFilesIE +from .vice import ViceIE +from .xfileshare import XFileShareIE +from .cloudflarestream import CloudflareStreamIE +from .peertube import PeerTubeIE +from .teachable import TeachableIE +from .indavideo import IndavideoEmbedIE +from .apa import APAIE +from .foxnews import FoxNewsIE +from .viqeo import 
ViqeoIE +from .expressen import ExpressenIE +from .zype import ZypeIE +from .odnoklassniki import OdnoklassnikiIE +from .kinja import KinjaEmbedIE + + +class GenericIE(InfoExtractor): + IE_DESC = 'Generic downloader that works on some sites' + _VALID_URL = r'.*' + IE_NAME = 'generic' + _TESTS = [ + # Direct link to a video + { + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'ext': 'mp4', + 'title': 'trailer', + 'upload_date': '20100513', + } + }, + # Direct link to media delivered compressed (until Accept-Encoding is *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ], + 'skip': 'URL invalid', + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented', + r'400.*Bad Request', + ], + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' 
+ ] + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:.*groundbreaking video review series.*' + }, + 'playlist_mincount': 11, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } + }, + # RSS feed with enclosures and unsupported link URLs + { + 'url': 'http://www.hellointernet.fm/podcast?format=rss', + 'info_dict': { + 'id': 'http://www.hellointernet.fm/podcast?format=rss', + 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', + 'title': 'Hello Internet', + }, + 'playlist_mincount': 100, + }, + # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng + { + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', + 'info_dict': { + 'id': 'smil', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', + 'formats': 'mincount:16', + 'subtitles': 'mincount:1', + }, + 'params': { + 'force_generic_extractor': True, + 'skip_download': True, + }, + }, + # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html + { + 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', + 'info_dict': { + 'id': 'hds', + 'ext': 'flv', + 'title': 'hds', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from https://www.restudy.dk/video/play/id/1637 + { + 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', + 'info_dict': { + 'id': 'video_1637', + 'ext': 'flv', + 'title': 'video_1637', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL 
from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm + { + 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', + 'info_dict': { + 'id': 'smil-service', + 'ext': 'flv', + 'title': 'smil-service', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 + { + 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', + 'info_dict': { + 'id': '4719370', + 'ext': 'mp4', + 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html + { + 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', + 'info_dict': { + 'id': 'mZlp2ctYIUEB', + 'ext': 'mp4', + 'title': 'Tikibad ontruimd wegens brand', + 'description': 'md5:05ca046ff47b931f9b04855015e163a4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 33, + }, + 'params': { + 'skip_download': True, + }, + }, + # MPD from http://dash-mse-test.appspot.com/media.html + { + 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', + 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', + 'info_dict': { + 'id': 'car-20120827-manifest', + 'ext': 'mp4', + 'title': 'car-20120827-manifest', + 'formats': 'mincount:9', + 'upload_date': '20130904', + }, + 'params': { + 'format': 'bestvideo', + }, + }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': 
True, + }, + 'skip': 'video gone', + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': 'video gone', + }, + # google redirect + { + 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': r're:^Chris Ziegler takes a look at the\.*', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, + { + # redirect in Refresh HTTP header + 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', + 'upload_date': '20150917', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + }, + 'params': { + 'skip_download': False, + }, + }, + { + 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', + 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', + 'info_dict': { + 'id': '13601338388002', + 'ext': 'mp4', + 'uploader': 'www.hodiho.fr', + 'title': 'R\u00e9gis plante sa Jeep', + } + }, + # bandcamp page with custom domain + { + 'add_ie': ['Bandcamp'], + 'url': 
'http://bronyrock.com/track/the-pony-mash', + 'info_dict': { + 'id': '3235767654', + 'ext': 'mp3', + 'title': 'The Pony Mash', + 'uploader': 'M_Pallante', + }, + 'skip': 'There is a limit of 200 free downloads / month for the test song', + }, + { + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' + # in the http requests + 'add_ie': ['BrightcoveLegacy'], + 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', + 'info_dict': { + 'id': '2765128793001', + 'ext': 'mp4', + 'title': 'Le cours de bourse : l’analyse technique', + 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', + 'uploader': 'BFM BUSINESS', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # embedded with itemprop embedURL and video id spelled as `idVideo` + 'add_id': ['BrightcoveLegacy'], + 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', + 'info_dict': { + 'id': '5255628253001', + 'ext': 'mp4', + 'title': 'md5:37c519b1128915607601e75a87995fc0', + 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', + 'uploader': 'BFM BUSINESS', + 'uploader_id': '876450612001', + 'timestamp': 1482255315, + 'upload_date': '20161220', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # https://github.com/ytdl-org/youtube-dl/issues/2253 + 'url': 'http://bcove.me/i6nfkrc3', + 'md5': '0ba9446db037002366bab3b3eb30c88c', + 'info_dict': { + 'id': '3101154703001', + 'ext': 'mp4', + 'title': 'Still no power', + 'uploader': 'thestar.com', + 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. 
To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', + }, + 'add_ie': ['BrightcoveLegacy'], + 'skip': 'video gone', + }, + { + 'url': 'http://www.championat.com/video/football/v/87/87499.html', + 'md5': 'fb973ecf6e4a78a67453647444222983', + 'info_dict': { + 'id': '3414141473001', + 'ext': 'mp4', + 'title': 'Видео. Удаление Дзагоева (ЦСКА)', + 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', + 'uploader': 'Championat', + }, + }, + { + # https://github.com/ytdl-org/youtube-dl/issues/3541 + 'add_ie': ['BrightcoveLegacy'], + 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', + 'info_dict': { + 'id': '3866516442001', + 'ext': 'mp4', + 'title': 'Leer mij vrouwen kennen: Aflevering 1', + 'description': 'Leer mij vrouwen kennen: Aflevering 1', + 'uploader': 'SBS Broadcasting', + }, + 'skip': 'Restricted to Netherlands', + 'params': { + 'skip_download': True, # m3u8 download + }, + }, + { + # Brightcove video in ', webpage): + url = self._search_regex( + r'src=(["\'])(?P.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + + if not url: + url = self._og_search_url(webpage) + + mobj = re.match( + self._VALID_URL, self._proto_relative_url(url.strip())) + + player_id = mobj.group('player_id') + if not display_id: + display_id = player_id + if player_id: + player_page = self._download_webpage( + url, display_id, note='Downloading player page', + errnote='Could not download player page') + video_id = self._search_regex( + r'\d+)' + _TEST = { + 'url': 'http://www.pearvideo.com/video_1076290', + 'info_dict': { + 'id': '1076290', + 'ext': 'mp4', + 'title': '小浣熊在主人家玻璃上滚石头:没砸', + 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', + 'timestamp': 1494275280, + 'upload_date': '20170508', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + quality = qualities( + ('ldflv', 
'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src')) + + formats = [{ + 'url': mobj.group('url'), + 'format_id': mobj.group('id'), + 'quality': quality(mobj.group('id')), + } for mobj in re.finditer( + r'(?P[a-zA-Z]+)Url\s*=\s*(["\'])(?P(?:https?:)?//.+?)\2', + webpage)] + self._sort_formats(formats) + + title = self._search_regex( + (r']+\bclass=(["\'])video-tt\1[^>]*>(?P[^<]+)', + r'<[^>]+\bdata-title=(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'title', group='value') + description = self._search_regex( + (r']+\bclass=(["\'])summary\1[^>]*>(?P[^<]+)', + r'<[^>]+\bdata-summary=(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'description', default=None, + group='value') or self._html_search_meta('Description', webpage) + timestamp = unified_timestamp(self._search_regex( + r']+\bclass=["\']date["\'][^>]*>([^<]+)', + webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/peertube.py b/youtube_dlc/extractor/peertube.py new file mode 100644 index 0000000..48fb954 --- /dev/null +++ b/youtube_dlc/extractor/peertube.py @@ -0,0 +1,600 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_resolution, + str_or_none, + try_get, + unified_timestamp, + url_or_none, + urljoin, +) + + +class PeerTubeIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + # Taken from https://instances.joinpeertube.org/instances + peertube\.rainbowswingers\.net| + tube\.stanisic\.nl| + peer\.suiri\.us| + medias\.libox\.fr| + videomensoif\.ynh\.fr| + peertube\.travelpandas\.eu| + peertube\.rachetjay\.fr| + peertube\.montecsys\.fr| + tube\.eskuero\.me| + peer\.tube| + peertube\.umeahackerspace\.se| + tube\.nx-pod\.de| + video\.monsieurbidouille\.fr| + tube\.openalgeria\.org| + vid\.lelux\.fi| + video\.anormallostpod\.ovh| + tube\.crapaud-fou\.org| + 
peertube\.stemy\.me| + lostpod\.space| + exode\.me| + peertube\.snargol\.com| + vis\.ion\.ovh| + videosdulib\.re| + v\.mbius\.io| + videos\.judrey\.eu| + peertube\.osureplayviewer\.xyz| + peertube\.mathieufamily\.ovh| + www\.videos-libr\.es| + fightforinfo\.com| + peertube\.fediverse\.ru| + peertube\.oiseauroch\.fr| + video\.nesven\.eu| + v\.bearvideo\.win| + video\.qoto\.org| + justporn\.cc| + video\.vny\.fr| + peervideo\.club| + tube\.taker\.fr| + peertube\.chantierlibre\.org| + tube\.ipfixe\.info| + tube\.kicou\.info| + tube\.dodsorf\.as| + videobit\.cc| + video\.yukari\.moe| + videos\.elbinario\.net| + hkvideo\.live| + pt\.tux\.tf| + www\.hkvideo\.live| + FIGHTFORINFO\.com| + pt\.765racing\.com| + peertube\.gnumeria\.eu\.org| + nordenmedia\.com| + peertube\.co\.uk| + tube\.darfweb\.eu| + tube\.kalah-france\.org| + 0ch\.in| + vod\.mochi\.academy| + film\.node9\.org| + peertube\.hatthieves\.es| + video\.fitchfamily\.org| + peertube\.ddns\.net| + video\.ifuncle\.kr| + video\.fdlibre\.eu| + tube\.22decembre\.eu| + peertube\.harmoniescreatives\.com| + tube\.fabrigli\.fr| + video\.thedwyers\.co| + video\.bruitbruit\.com| + peertube\.foxfam\.club| + peer\.philoxweb\.be| + videos\.bugs\.social| + peertube\.malbert\.xyz| + peertube\.bilange\.ca| + libretube\.net| + diytelevision\.com| + peertube\.fedilab\.app| + libre\.video| + video\.mstddntfdn\.online| + us\.tv| + peertube\.sl-network\.fr| + peertube\.dynlinux\.io| + peertube\.david\.durieux\.family| + peertube\.linuxrocks\.online| + peerwatch\.xyz| + v\.kretschmann\.social| + tube\.otter\.sh| + yt\.is\.nota\.live| + tube\.dragonpsi\.xyz| + peertube\.boneheadmedia\.com| + videos\.funkwhale\.audio| + watch\.44con\.com| + peertube\.gcaillaut\.fr| + peertube\.icu| + pony\.tube| + spacepub\.space| + tube\.stbr\.io| + v\.mom-gay\.faith| + tube\.port0\.xyz| + peertube\.simounet\.net| + play\.jergefelt\.se| + peertube\.zeteo\.me| + tube\.danq\.me| + peertube\.kerenon\.com| + tube\.fab-l3\.org| + tube\.calculate\.social| + 
peertube\.mckillop\.org| + tube\.netzspielplatz\.de| + vod\.ksite\.de| + peertube\.laas\.fr| + tube\.govital\.net| + peertube\.stephenson\.cc| + bistule\.nohost\.me| + peertube\.kajalinifi\.de| + video\.ploud\.jp| + video\.omniatv\.com| + peertube\.ffs2play\.fr| + peertube\.leboulaire\.ovh| + peertube\.tronic-studio\.com| + peertube\.public\.cat| + peertube\.metalbanana\.net| + video\.1000i100\.fr| + peertube\.alter-nativ-voll\.de| + tube\.pasa\.tf| + tube\.worldofhauru\.xyz| + pt\.kamp\.site| + peertube\.teleassist\.fr| + videos\.mleduc\.xyz| + conf\.tube| + media\.privacyinternational\.org| + pt\.forty-two\.nl| + video\.halle-leaks\.de| + video\.grosskopfgames\.de| + peertube\.schaeferit\.de| + peertube\.jackbot\.fr| + tube\.extinctionrebellion\.fr| + peertube\.f-si\.org| + video\.subak\.ovh| + videos\.koweb\.fr| + peertube\.zergy\.net| + peertube\.roflcopter\.fr| + peertube\.floss-marketing-school\.com| + vloggers\.social| + peertube\.iriseden\.eu| + videos\.ubuntu-paris\.org| + peertube\.mastodon\.host| + armstube\.com| + peertube\.s2s\.video| + peertube\.lol| + tube\.open-plug\.eu| + open\.tube| + peertube\.ch| + peertube\.normandie-libre\.fr| + peertube\.slat\.org| + video\.lacaveatonton\.ovh| + peertube\.uno| + peertube\.servebeer\.com| + peertube\.fedi\.quebec| + tube\.h3z\.jp| + tube\.plus200\.com| + peertube\.eric\.ovh| + tube\.metadocs\.cc| + tube\.unmondemeilleur\.eu| + gouttedeau\.space| + video\.antirep\.net| + nrop\.cant\.at| + tube\.ksl-bmx\.de| + tube\.plaf\.fr| + tube\.tchncs\.de| + video\.devinberg\.com| + hitchtube\.fr| + peertube\.kosebamse\.com| + yunopeertube\.myddns\.me| + peertube\.varney\.fr| + peertube\.anon-kenkai\.com| + tube\.maiti\.info| + tubee\.fr| + videos\.dinofly\.com| + toobnix\.org| + videotape\.me| + voca\.tube| + video\.heromuster\.com| + video\.lemediatv\.fr| + video\.up\.edu\.ph| + balafon\.video| + video\.ivel\.fr| + thickrips\.cloud| + pt\.laurentkruger\.fr| + video\.monarch-pass\.net| + peertube\.artica\.center| + 
video\.alternanet\.fr| + indymotion\.fr| + fanvid\.stopthatimp\.net| + video\.farci\.org| + v\.lesterpig\.com| + video\.okaris\.de| + tube\.pawelko\.net| + peertube\.mablr\.org| + tube\.fede\.re| + pytu\.be| + evertron\.tv| + devtube\.dev-wiki\.de| + raptube\.antipub\.org| + video\.selea\.se| + peertube\.mygaia\.org| + video\.oh14\.de| + peertube\.livingutopia\.org| + peertube\.the-penguin\.de| + tube\.thechangebook\.org| + tube\.anjara\.eu| + pt\.pube\.tk| + video\.samedi\.pm| + mplayer\.demouliere\.eu| + widemus\.de| + peertube\.me| + peertube\.zapashcanon\.fr| + video\.latavernedejohnjohn\.fr| + peertube\.pcservice46\.fr| + peertube\.mazzonetto\.eu| + video\.irem\.univ-paris-diderot\.fr| + video\.livecchi\.cloud| + alttube\.fr| + video\.coop\.tools| + video\.cabane-libre\.org| + peertube\.openstreetmap\.fr| + videos\.alolise\.org| + irrsinn\.video| + video\.antopie\.org| + scitech\.video| + tube2\.nemsia\.org| + video\.amic37\.fr| + peertube\.freeforge\.eu| + video\.arbitrarion\.com| + video\.datsemultimedia\.com| + stoptrackingus\.tv| + peertube\.ricostrongxxx\.com| + docker\.videos\.lecygnenoir\.info| + peertube\.togart\.de| + tube\.postblue\.info| + videos\.domainepublic\.net| + peertube\.cyber-tribal\.com| + video\.gresille\.org| + peertube\.dsmouse\.net| + cinema\.yunohost\.support| + tube\.theocevaer\.fr| + repro\.video| + tube\.4aem\.com| + quaziinc\.com| + peertube\.metawurst\.space| + videos\.wakapo\.com| + video\.ploud\.fr| + video\.freeradical\.zone| + tube\.valinor\.fr| + refuznik\.video| + pt\.kircheneuenburg\.de| + peertube\.asrun\.eu| + peertube\.lagob\.fr| + videos\.side-ways\.net| + 91video\.online| + video\.valme\.io| + video\.taboulisme\.com| + videos-libr\.es| + tv\.mooh\.fr| + nuage\.acostey\.fr| + video\.monsieur-a\.fr| + peertube\.librelois\.fr| + videos\.pair2jeux\.tube| + videos\.pueseso\.club| + peer\.mathdacloud\.ovh| + media\.assassinate-you\.net| + vidcommons\.org| + ptube\.rousset\.nom\.fr| + tube\.cyano\.at| + videos\.squat\.net| + 
video\.iphodase\.fr| + peertube\.makotoworkshop\.org| + peertube\.serveur\.slv-valbonne\.fr| + vault\.mle\.party| + hostyour\.tv| + videos\.hack2g2\.fr| + libre\.tube| + pire\.artisanlogiciel\.net| + videos\.numerique-en-commun\.fr| + video\.netsyms\.com| + video\.die-partei\.social| + video\.writeas\.org| + peertube\.swarm\.solvingmaz\.es| + tube\.pericoloso\.ovh| + watching\.cypherpunk\.observer| + videos\.adhocmusic\.com| + tube\.rfc1149\.net| + peertube\.librelabucm\.org| + videos\.numericoop\.fr| + peertube\.koehn\.com| + peertube\.anarchmusicall\.net| + tube\.kampftoast\.de| + vid\.y-y\.li| + peertube\.xtenz\.xyz| + diode\.zone| + tube\.egf\.mn| + peertube\.nomagic\.uk| + visionon\.tv| + videos\.koumoul\.com| + video\.rastapuls\.com| + video\.mantlepro\.com| + video\.deadsuperhero\.com| + peertube\.musicstudio\.pro| + peertube\.we-keys\.fr| + artitube\.artifaille\.fr| + peertube\.ethernia\.net| + tube\.midov\.pl| + peertube\.fr| + watch\.snoot\.tube| + peertube\.donnadieu\.fr| + argos\.aquilenet\.fr| + tube\.nemsia\.org| + tube\.bruniau\.net| + videos\.darckoune\.moe| + tube\.traydent\.info| + dev\.videos\.lecygnenoir\.info| + peertube\.nayya\.org| + peertube\.live| + peertube\.mofgao\.space| + video\.lequerrec\.eu| + peertube\.amicale\.net| + aperi\.tube| + tube\.ac-lyon\.fr| + video\.lw1\.at| + www\.yiny\.org| + videos\.pofilo\.fr| + tube\.lou\.lt| + choob\.h\.etbus\.ch| + tube\.hoga\.fr| + peertube\.heberge\.fr| + video\.obermui\.de| + videos\.cloudfrancois\.fr| + betamax\.video| + video\.typica\.us| + tube\.piweb\.be| + video\.blender\.org| + peertube\.cat| + tube\.kdy\.ch| + pe\.ertu\.be| + peertube\.social| + videos\.lescommuns\.org| + tv\.datamol\.org| + videonaute\.fr| + dialup\.express| + peertube\.nogafa\.org| + megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.bootlicker\.party| + skeptikon\.fr| + video\.blueline\.mg| + tube\.homecomputing\.fr| + tube\.ouahpiti\.info| + 
video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + tube\.kher\.nl| + peertube\.qtg\.fr| + video\.migennes\.net| + tube\.p2p\.legal| + troll\.tv| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.cemea\.org| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + sikke\.fi| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + videos\.benpro\.fr| + peertube\.parleur\.net| + peertube\.heraut\.eu| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + tube\.mochi\.academy| + media\.zat\.im| + video\.colibris-outilslibres\.org| + tube\.svnet\.fr| + peertube\.video| + peertube3\.cpy\.re| + peertube2\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re + )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + _API_BASE = 'https://%s/api/v1/videos/%s/%s' + _VALID_URL = r'''(?x) + (?: + peertube:(?P[^:]+):| + https?://(?P%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ + ) + (?P%s) + ''' % (_INSTANCES_RE, _UUID_RE) + _TESTS = [{ + 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'md5': '9bed8c0137913e17b86334e5885aacff', + 'info_dict': { + 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', + 'ext': 'mp4', + 'title': 'What is PeerTube?', + 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1538391166, + 'upload_date': '20181001', + 'uploader': 'Framasoft', + 'uploader_id': '3', + 'uploader_url': 'https://framatube.org/accounts/framasoft', + 'channel': 'Les vidéos de Framasoft', + 'channel_id': '2', + 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'language': 'en', + 'license': 'Attribution - Share Alike', + 'duration': 113, + 'view_count': int, + 'like_count': int, + 'dislike_count': 
int, + 'tags': ['framasoft', 'peertube'], + 'categories': ['Science & Technology'], + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, + }, { + 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'only_matching': True, + }] + + @staticmethod + def _extract_peertube_url(webpage, source_url): + mobj = re.match( + r'https?://(?P[^/]+)/videos/(?:watch|embed)/(?P%s)' + % PeerTubeIE._UUID_RE, source_url) + if mobj and any(p in webpage for p in ( + 'PeerTube<', + 'There will be other non JS-based clients to access PeerTube', + '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): + return 'peertube:%s:%s' % mobj.group('host', 'id') + + @staticmethod + def _extract_urls(webpage, source_url): + entries = re.findall( + r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' + % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) + if not entries: + peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) + if peertube_url: + entries = [peertube_url] + return entries + + def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, video_id, path), video_id, + note=note, errnote=errnote, fatal=fatal) + + def _get_subtitles(self, host, video_id): + captions = self._call_api( + host, video_id, 'captions', note='Downloading captions JSON', + fatal=False) + if not isinstance(captions, dict): + return + data = captions.get('data') + if not isinstance(data, list): 
+ return + subtitles = {} + for e in data: + language_id = try_get(e, lambda x: x['language']['id'], compat_str) + caption_url = urljoin('https://%s' % host, e.get('captionPath')) + if not caption_url: + continue + subtitles.setdefault(language_id or 'en', []).append({ + 'url': caption_url, + }) + return subtitles + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or mobj.group('host_2') + video_id = mobj.group('id') + + video = self._call_api( + host, video_id, '', note='Downloading video JSON') + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + full_description = self._call_api( + host, video_id, 'description', note='Downloading description JSON', + fatal=False) + + description = None + if isinstance(full_description, dict): + description = str_or_none(full_description.get('description')) + if not description: + description = video.get('description') + + subtitles = self.extract_subtitles(host, video_id) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': urljoin(url, 
video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': str_or_none(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str_or_none(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + 'subtitles': subtitles + } diff --git a/youtube_dlc/extractor/people.py b/youtube_dlc/extractor/people.py new file mode 100644 index 0000000..6ca9571 --- /dev/null +++ b/youtube_dlc/extractor/people.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class PeopleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html' + + _TEST = { + 'url': 'http://www.people.com/people/videos/0,,20995451,00.html', + 'info_dict': { + 'id': 'ref:20995451', + 'ext': 'mp4', + 'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”', + 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 246.318, + 'timestamp': 1458720585, + 'upload_date': '20160323', + 'uploader_id': '416418724', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + } + + def _real_extract(self, url): + return self.url_result( + 
class PerformGroupIE(InfoExtractor):
    """Extractor for videos embedded through the Perform Group eplayer."""

    _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})'
    _TESTS = [{
        # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html
        'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab',
        'md5': '259cb03d142e2e52471e8837ecacb29f',
        'info_dict': {
            'id': 'xgrwobuzumes1lwjxtcdpwgxd',
            'ext': 'mp4',
            'title': 'Liga MX: Keine Einsicht nach Horrorfoul',
            'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b',
            'timestamp': 1511533477,
            'upload_date': '20171124',
        }
    }]

    def _call_api(self, service, auth_token, content_id, referer_url):
        """Download the JSON document of the ``ep<service>`` feed.

        ``referer_url`` is sent as the ``Referer`` header alongside a fixed
        ``Origin`` header.
        """
        endpoint = 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (
            service, auth_token, content_id)
        request_headers = {
            'Referer': referer_url,
            'Origin': 'http://player.performgroup.com',
        }
        return self._download_json(
            endpoint, content_id,
            headers=request_headers, query={'_fmt': 'json'})

    def _real_extract(self, url):
        """Resolve the player bootstrap to a video and collect its formats."""
        player_id, auth_token = re.search(self._VALID_URL, url).groups()

        # Two API round trips: the bootstrap names the video, the vod
        # document lists its media.
        bootstrap = self._call_api('bootstrap', auth_token, player_id, url)
        source_item = bootstrap['config']['dataSource']['sourceItems'][0]
        video = source_item['videos'][0]
        video_id = video['uuid']
        vod = self._call_api('vod', auth_token, video_id, url)
        media = vod['videos']['video'][0]['media']

        formats = []

        hls_url = media.get('hls', {}).get('url')
        if hls_url:
            hls_formats = self._extract_m3u8_formats(
                hls_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False)
            formats.extend(hls_formats)

        hds_url = media.get('hds', {}).get('url')
        if hds_url:
            hds_formats = self._extract_f4m_formats(
                hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False)
            formats.extend(hds_formats)

        # Progressive HTTP renditions; the format id carries the bitrate
        # when one is known.
        for item in media.get('content', []):
            item_url = item.get('url')
            if item_url:
                bitrate = int_or_none(item.get('bitrate'), 1000)
                formats.append({
                    'format_id': 'http-%d' % bitrate if bitrate else 'http',
                    'url': item_url,
                    'tbr': bitrate,
                    'width': int_or_none(item.get('width')),
                    'height': int_or_none(item.get('height')),
                    'filesize': int_or_none(item.get('fileSize')),
                    'vcodec': item.get('type'),
                    'fps': int_or_none(item.get('videoFrameRate')),
                    'vbr': int_or_none(item.get('videoRate'), 1000),
                    'abr': int_or_none(item.get('audioRate'), 1000),
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video['title'],
            'description': video.get('description'),
            'thumbnail': video.get('poster'),
            'duration': int_or_none(video.get('duration')),
            'timestamp': int_or_none(video.get('publishedTime'), 1000),
            'formats': formats,
        }
broadcast.get('state').lower() == 'running' + + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'title': self._live_title(title) if is_live else title, + 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'uploader': uploader, + 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(broadcast.get('total_watched')), + 'tags': broadcast.get('tags'), + 'is_live': is_live, + } + + @staticmethod + def _extract_common_format_info(broadcast): + return broadcast.get('state').lower(), int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) + + @staticmethod + def _add_width_and_height(f, width, height): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=fatal) + if len(m3u8_formats) == 1: + self._add_width_and_height(m3u8_formats[0], width, height) + return m3u8_formats + + +class PeriscopeIE(PeriscopeBaseIE): + IE_DESC = 'Periscope' + IE_NAME = 'periscope' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' + # Alive example URLs can be found here https://www.periscope.tv/ + _TESTS = [{ + 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', + 'md5': '65b57957972e503fcbbaeed8f4fa04ca', + 'info_dict': { + 'id': '56102209', + 'ext': 'mp4', + 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗', + 'timestamp': 1438978559, + 'upload_date': '20150807', + 'uploader': 'Bec Boop', + 'uploader_id': 
'1465763', + }, + 'skip': 'Expires in 24 hours', + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', + 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + token = self._match_id(url) + + stream = self._call_api( + 'accessVideoPublic', {'broadcast_id': token}, token) + + broadcast = stream['broadcast'] + info = self._parse_broadcast_data(broadcast, token) + + state = broadcast.get('state').lower() + width = int_or_none(broadcast.get('width')) + height = int_or_none(broadcast.get('height')) + + def add_width_and_height(f): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + video_urls = set() + formats = [] + for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): + video_url = stream.get(format_id + '_url') + if not video_url or video_url in video_urls: + continue + video_urls.add(video_url) + if format_id != 'rtmp': + m3u8_formats = self._extract_pscp_m3u8_formats( + video_url, token, format_id, state, width, height, False) + formats.extend(m3u8_formats) + continue + rtmp_format = { + 'url': video_url, + 'ext': 'flv' if format_id == 'rtmp' else 'mp4', + } + self._add_width_and_height(rtmp_format) + formats.append(rtmp_format) + self._sort_formats(formats) + + info['formats'] = formats + return info + + +class PeriscopeUserIE(PeriscopeBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$' + IE_DESC = 'Periscope user videos' + IE_NAME = 'periscope:user' + + _TEST = { + 'url': 'https://www.periscope.tv/LularoeHusbandMike/', + 'info_dict': { + 'id': 
'LularoeHusbandMike', + 'title': 'LULAROE HUSBAND MIKE', + 'description': 'md5:6cf4ec8047768098da58e446e82c82f0', + }, + # Periscope only shows videos in the last 24 hours, so it's possible to + # get 0 videos + 'playlist_mincount': 0, + } + + def _real_extract(self, url): + user_name = self._match_id(url) + + webpage = self._download_webpage(url, user_name) + + data_store = self._parse_json( + unescapeHTML(self._search_regex( + r'data-store=(["\'])(?P<data>.+?)\1', + webpage, 'data store', default='{}', group='data')), + user_name) + + user = list(data_store['UserCache']['users'].values())[0]['user'] + user_id = user['id'] + session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id'] + + broadcasts = self._call_api( + 'getUserBroadcastsPublic', + {'user_id': user_id, 'session_id': session_id}, + user_name)['broadcasts'] + + broadcast_ids = [ + broadcast['id'] for broadcast in broadcasts if broadcast.get('id')] + + title = user.get('display_name') or user.get('username') or user_name + description = user.get('description') + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) + for broadcast_id in broadcast_ids] + + return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dlc/extractor/philharmoniedeparis.py b/youtube_dlc/extractor/philharmoniedeparis.py new file mode 100644 index 0000000..03da64b --- /dev/null +++ b/youtube_dlc/extractor/philharmoniedeparis.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + try_get, + urljoin, +) + + +class PhilharmonieDeParisIE(InfoExtractor): + IE_DESC = 'Philharmonie de Paris' + _VALID_URL = r'''(?x) + https?:// + (?: + live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)| + pad\.philharmoniedeparis\.fr/doc/CIMU/ + ) + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 
'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', + 'md5': 'a0a4b195f544645073631cbec166a2c2', + 'info_dict': { + 'id': '1086697', + 'ext': 'mp4', + 'title': 'Jazz à la Villette : Knower', + }, + }, { + 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', + 'info_dict': { + 'id': '1032066', + 'title': 'md5:0a031b81807b3593cffa3c9a87a167a0', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html', + 'only_matching': True, + }, { + 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', + 'only_matching': True, + }, { + 'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', + 'only_matching': True, + }, { + 'url': 'https://live.philharmoniedeparis.fr/embed/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', + 'only_matching': True, + }] + _LIVE_URL = 'https://live.philharmoniedeparis.fr' + + def _real_extract(self, url): + video_id = self._match_id(url) + + config = self._download_json( + '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={ + 'id': video_id, + 'lang': 'fr-FR', + }) + + def extract_entry(source): + if not isinstance(source, dict): + return + title = source.get('title') + if not title: + return + files = source.get('files') + if not isinstance(files, dict): + return + format_urls = set() + formats = [] + for format_id in ('mobile', 'desktop'): + format_url = try_get( + files, lambda x: x[format_id]['file'], compat_str) + if not format_url or format_url in format_urls: + continue + format_urls.add(format_url) + m3u8_url = urljoin(self._LIVE_URL, format_url) + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + if not formats: + return + self._sort_formats(formats) + return { + 'title': title, + 'formats': formats, + } 
+ + thumbnail = urljoin(self._LIVE_URL, config.get('image')) + + info = extract_entry(config) + if info: + info.update({ + 'id': video_id, + 'thumbnail': thumbnail, + }) + return info + + entries = [] + for num, chapter in enumerate(config['chapters'], start=1): + entry = extract_entry(chapter) + entry['id'] = '%s-%d' % (video_id, num) + entries.append(entry) + + return self.playlist_result(entries, video_id, config.get('title')) diff --git a/youtube_dlc/extractor/phoenix.py b/youtube_dlc/extractor/phoenix.py new file mode 100644 index 0000000..8d52ad3 --- /dev/null +++ b/youtube_dlc/extractor/phoenix.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class PhoenixIE(InfoExtractor): + IE_NAME = 'phoenix.de' + _VALID_URL = r'''https?://(?:www\.)?phoenix.de/\D+(?P<id>\d+)\.html''' + _TESTS = [ + { + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/unsere-welt-in-zukunft---stadt-a-1283620.html', + 'md5': '5e765e838aa3531c745a4f5b249ee3e3', + 'info_dict': { + 'id': '0OB4HFc43Ns', + 'ext': 'mp4', + 'title': 'Unsere Welt in Zukunft - Stadt', + 'description': 'md5:9bfb6fd498814538f953b2dcad7ce044', + 'upload_date': '20190912', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', + } + }, + { + 'url': 'https://www.phoenix.de/drohnenangriffe-in-saudi-arabien-a-1286995.html?ref=aktuelles', + 'only_matching': True, + }, + # an older page: https://www.phoenix.de/sendungen/gespraeche/phoenix-persoenlich/im-dialog-a-177727.html + # seems to not have an embedded video, even though it's uploaded on youtube: https://www.youtube.com/watch?v=4GxnoUHvOkM + ] + + def extract_from_json_api(self, video_id, api_url): + doc = self._download_json( + api_url, video_id, + note="Downloading webpage metadata", + errnote="Failed to load webpage metadata") + + for a in doc["absaetze"]: + if a["typ"] == "video-youtube": + return { + '_type': 'url_transparent', + 'id': a["id"], + 'title': doc["titel"], + 
class PhotobucketIE(InfoExtractor):
    """Extractor for videos hosted on photobucket.com."""

    _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    _TEST = {
        'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
        'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
        'info_dict': {
            'id': 'zpsc0c3b9fa',
            'ext': 'mp4',
            'timestamp': 1367669341,
            'upload_date': '20130504',
            'uploader': 'rachaneronas',
            'title': 'Tired of Link Building? Try BacklinkMyDomain.com!',
        }
    }

    def _real_extract(self, url):
        """Read the media JSON embedded in the page and build the result.

        The id and extension come from the page URL; everything else is
        taken from the ``Pb.Data.Shared.MEDIA`` JSON blob.
        """
        match = re.match(self._VALID_URL, url)
        video_id, video_extension = match.group('id'), match.group('ext')

        webpage = self._download_webpage(url, video_id)

        # URL, uploader and title all come from the embedded media JSON.
        self.report_extraction(video_id)
        media_json = self._search_regex(
            r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
            webpage, 'info json')
        media = json.loads(media_json)

        # The direct file URL is percent-encoded inside the HTML embed code.
        embed_code = media['linkcodes']['html']
        quoted_url = self._html_search_regex(
            r'file=(.+\.mp4)', embed_code, 'url')
        video_url = compat_urllib_parse_unquote(quoted_url)

        return {
            'id': video_id,
            'url': video_url,
            'uploader': media['username'],
            'timestamp': media['creationDate'],
            'title': media['title'],
            'ext': video_extension,
            'thumbnail': media['thumbUrl'],
        }
+ _TEST = { + 'url': 'https://picarto.tv/Setz', + 'info_dict': { + 'id': 'Setz', + 'ext': 'mp4', + 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'timestamp': int, + 'is_live': True + }, + 'skip': 'Stream is offline', + } + + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + + metadata = self._download_json( + 'https://api.picarto.tv/v1/channel/name/' + channel_id, + channel_id) + + if metadata.get('online') is False: + raise ExtractorError('Stream is offline', expected=True) + + cdn_data = self._download_json( + 'https://picarto.tv/process/channel', channel_id, + data=urlencode_postdata({'loadbalancinginfo': channel_id}), + note='Downloading load balancing info') + + token = mobj.group('token') or 'public' + params = { + 'con': int(time.time() * 1000), + 'token': token, + } + + prefered_edge = cdn_data.get('preferedEdge') + formats = [] + + for edge in cdn_data['edges']: + edge_ep = edge.get('ep') + if not edge_ep or not isinstance(edge_ep, compat_str): + continue + edge_id = edge.get('id') + for tech in cdn_data['techs']: + tech_label = tech.get('label') + tech_type = tech.get('type') + preference = 0 + if edge_id == prefered_edge: + preference += 1 + format_id = [] + if edge_id: + format_id.append(edge_id) + if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': + format_id.append('hls') + formats.extend(self._extract_m3u8_formats( + update_url_query( + 'https://%s/hls/%s/index.m3u8' + % (edge_ep, channel_id), params), + channel_id, 'mp4', preference=preference, + m3u8_id='-'.join(format_id), fatal=False)) + continue + elif tech_type == 'video/mp4' or tech_label == 'MP4': + format_id.append('mp4') + formats.append({ + 'url': update_url_query( + 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), + params), + 'format_id': '-'.join(format_id), + 
class PicartoVodIE(InfoExtractor):
    """Extractor for recorded videos on picarto.tv ``videopopout`` pages."""

    # The dot after "www" is escaped so that only a literal "www." prefix is
    # accepted, as in the other extractors' patterns.
    _VALID_URL = r'https?://(?:www\.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv',
        'md5': '3ab45ba4352c52ee841a28fb73f2d9ca',
        'info_dict': {
            'id': 'ArtofZod_2017.12.12.00.13.23.flv',
            'ext': 'mp4',
            'title': 'ArtofZod_2017.12.12.00.13.23.flv',
            'thumbnail': r're:^https?://.*\.jpg'
        },
    }, {
        'url': 'https://picarto.tv/videopopout/Plague',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Parse the player setup object in the page and extract HLS formats.

        The video id taken from the URL is also used as the title.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # The third argument is the human-readable name of the field being
        # searched for; the video id was passed there before.
        vod_info = self._parse_json(
            self._search_regex(
                r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage,
                'vod player'),
            video_id, transform_source=js_to_json)

        formats = self._extract_m3u8_formats(
            vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_id,
            'thumbnail': vod_info.get('vodThumb'),
            'formats': formats,
        }
class PikselIE(InfoExtractor):
    """Extractor for videos served by the Piksel player."""

    _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P<id>[a-z0-9_]+)'
    _TESTS = [
        {
            'url': 'http://player.piksel.com/v/ums2867l',
            'md5': '34e34c8d89dc2559976a6079db531e85',
            'info_dict': {
                'id': 'ums2867l',
                'ext': 'mp4',
                'title': 'GX-005 with Caption',
                'timestamp': 1481335659,
                'upload_date': '20161210'
            }
        },
        {
            # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al
            'url': 'https://player.piksel.com/v/v80kqp41',
            'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d',
            'info_dict': {
                'id': 'v80kqp41',
                'ext': 'mp4',
                'title': 'WAW- State of Washington vs. Donald J. Trump, et al',
                'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.',
                'timestamp': 1486171129,
                'upload_date': '20170204'
            }
        },
        {
            # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/
            'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477',
            'only_matching': True,
        }
    ]

    @staticmethod
    def _extract_url(webpage):
        """Return the first Piksel player iframe URL in ``webpage``, if any."""
        mobj = re.search(
            r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)',
            webpage)
        if mobj:
            return mobj.group('url')

    def _real_extract(self, url):
        """Query the program API and build formats and subtitles.

        Raises ExtractorError (expected) when the API response carries a
        ``failure`` entry.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        # Fall back to the id from the URL when the page has no program uuid.
        video_id = self._search_regex(
            r'data-de-program-uuid=[\'"]([a-z0-9]+)',
            webpage, 'program uuid', default=display_id)
        app_token = self._search_regex([
            r'clientAPI\s*:\s*"([^"]+)"',
            r'data-de-api-key\s*=\s*"([^"]+)"'
        ], webpage, 'app token')
        response = self._download_json(
            'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token,
            video_id, query={
                'v': video_id
            })['response']
        failure = response.get('failure')
        if failure:
            raise ExtractorError(failure['reason'], expected=True)
        video_data = response['WsProgramResponse']['program']['asset']
        title = video_data['title']

        formats = []

        m3u8_url = dict_get(video_data, [
            'm3u8iPadURL',
            'ipadM3u8Url',
            'm3u8AndroidURL',
            'm3u8iPhoneURL',
            'iphoneM3u8Url'])
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False))

        asset_type = dict_get(video_data, ['assetType', 'asset_type'])
        for asset_file in video_data.get('assetFiles', []):
            # TODO: extract rtmp formats
            http_url = asset_file.get('http_url')
            if not http_url:
                continue
            tbr = None
            vbr = int_or_none(asset_file.get('videoBitrate'), 1024)
            abr = int_or_none(asset_file.get('audioBitrate'), 1024)
            if asset_type == 'video':
                # Either bitrate may be unavailable (None); adding them then
                # raised a TypeError. Report a total only when both are known.
                if vbr is not None and abr is not None:
                    tbr = vbr + abr
            elif asset_type == 'audio':
                tbr = abr

            format_id = ['http']
            if tbr:
                format_id.append(compat_str(tbr))

            formats.append({
                'format_id': '-'.join(format_id),
                'url': unescapeHTML(http_url),
                'vbr': vbr,
                'abr': abr,
                'width': int_or_none(asset_file.get('videoWidth')),
                'height': int_or_none(asset_file.get('videoHeight')),
                'filesize': int_or_none(asset_file.get('filesize')),
                'tbr': tbr,
            })
        self._sort_formats(formats)

        subtitles = {}
        for caption in video_data.get('captions', []):
            caption_url = caption.get('url')
            if caption_url:
                # Captions without a locale are filed under 'en'.
                subtitles.setdefault(caption.get('locale', 'en'), []).append({
                    'url': caption_url})

        return {
            'id': video_id,
            'title': title,
            'description': video_data.get('description'),
            'thumbnail': video_data.get('thumbnailUrl'),
            'timestamp': parse_iso8601(video_data.get('dateadd')),
            'formats': formats,
            'subtitles': subtitles,
        }
class PinkbikeIE(InfoExtractor):
    """Extractor for pinkbike.com video pages."""

    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.pinkbike.com/video/402811/',
        'md5': '4814b8ca7651034cd87e3361d5c2155a',
        'info_dict': {
            'id': '402811',
            'ext': 'mp4',
            'title': 'Brandon Semenuk - RAW 100',
            'description': 'Official release: www.redbull.ca/rupertwalker',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 100,
            'upload_date': '20150406',
            'uploader': 'revelco',
            'location': 'Victoria, British Columbia, Canada',
            'view_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Scrape formats and metadata from the canonical video page.

        The page is always fetched from www.pinkbike.com using the numeric
        id, regardless of which URL form was given.
        """
        video_id = self._match_id(url)
        page_url = 'http://www.pinkbike.com/video/%s' % video_id
        webpage = self._download_webpage(page_url, video_id)

        # Each source carries a data-quality label such as "720p"; the first
        # captured group is only the quote style and is ignored.
        formats = []
        source_re = r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1'
        for source in re.findall(source_re, webpage):
            quality_label, source_url = source[1], source[2]
            height_text = self._search_regex(
                r'^(\d+)[pP]$', quality_label, 'height', default=None)
            formats.append({
                'url': source_url,
                'format_id': quality_label,
                'height': int_or_none(height_text),
            })
        self._sort_formats(formats)

        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')

        description = self._html_search_regex(
            r'(?s)id="media-description"[^>]*>(.+?)<',
            webpage, 'description', default=None)
        if not description:
            # Fall back to the Open Graph description minus the title prefix.
            description = remove_start(
                self._og_search_description(webpage), title + '. ')

        thumbnail = self._og_search_thumbnail(webpage)
        duration_text = self._html_search_meta(
            'video:duration', webpage, 'duration')
        duration = int_or_none(duration_text)

        uploader = self._search_regex(
            r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage,
            'uploader', fatal=False)
        upload_date_text = self._search_regex(
            r'class="fullTime"[^>]+title="([^"]+)"',
            webpage, 'upload date', fatal=False)
        upload_date = unified_strdate(upload_date_text)

        location = self._html_search_regex(
            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
            webpage, 'location', fatal=False)

        def stat_count(label):
            """Read the number shown next to the stat labelled ``label``."""
            stat_re = r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label
            return str_to_int(self._search_regex(
                stat_re, webpage, label, fatal=False))

        view_count = stat_count('Views')
        comment_count = stat_count('Comments')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'upload_date': upload_date,
            'uploader': uploader,
            'location': location,
            'view_count': view_count,
            'comment_count': comment_count,
            'formats': formats
        }
'ext': 'mp4', + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3190, + }, + }, { + 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', + 'only_matching': True, + }, { + 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + pl = qs.get('pl', ['1'])[0] + + video = self._download_xml( + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }) + + def fail(text): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, text), + expected=True) + + if video.tag == 'error': + fail(video.text) + + quality = qualities(('ld', 'sd', 'hd')) + + formats = [] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + + self._sort_formats(formats) + + webpage = self._download_webpage( + 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, + video_id) + + title = self._og_search_title(webpage, fatal=False) or xpath_text( + video, './/title', 'title', 
fatal=True) + description = self._search_regex( + r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) or xpath_text( + video, './/cover', 'cover') + + duration = int_or_none(xpath_text(video, './/time', 'duration')) + age_limit = int_or_none(xpath_text(video, './/age18', 'age limit')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/platzi.py b/youtube_dlc/extractor/platzi.py new file mode 100644 index 0000000..23c8256 --- /dev/null +++ b/youtube_dlc/extractor/platzi.py @@ -0,0 +1,224 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + str_or_none, + try_get, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class PlatziBaseIE(InfoExtractor): + _LOGIN_URL = 'https://platzi.com/login/' + _NETRC_MACHINE = 'platzi' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username, + 'password': password, + }) + + urlh = self._request_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + # login succeeded + if 'platzi.com/login' not in urlh.geturl(): + return + + login_error = self._webpage_read_content( + urlh, self._LOGIN_URL, None, 'Downloading login error page') + + login = self._parse_json( + self._search_regex( + r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), + None) + + for kind in ('error', 
'password', 'nonFields'): + error = str_or_none(login.get('%sError' % kind)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class PlatziIE(PlatziBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/[^/]+/(?P<id>\d+)-[^/?\#&]+ + ''' + + _TESTS = [{ + 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', + 'md5': '8f56448241005b561c10f11a595b37e3', + 'info_dict': { + 'id': '12074', + 'ext': 'mp4', + 'title': 'Creando nuestra primera página', + 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', + 'duration': 420, + }, + 'skip': 'Requires platzi account credentials', + }, { + 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', + 'info_dict': { + 'id': '13430', + 'ext': 'mp4', + 'title': 'Background', + 'description': 'md5:49c83c09404b15e6e71defaf87f6b305', + 'duration': 360, + }, + 'skip': 'Requires platzi account credentials', + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + lecture_id = self._match_id(url) + + webpage = self._download_webpage(url, lecture_id) + + data = self._parse_json( + self._search_regex( + # client_data may contain "};" so that we have to try more + # strict regex first + (r'client_data\s*=\s*({.+?})\s*;\s*\n', + r'client_data\s*=\s*({.+?})\s*;'), + webpage, 'client data'), + lecture_id) + + material = data['initialState']['material'] + desc = material['description'] + title = desc['title'] + + formats = [] + for server_id, server in material['videos'].items(): + if not isinstance(server, dict): + continue + for format_id in ('hls', 'dash'): + format_url = url_or_none(server.get(format_id)) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, lecture_id, 'mp4', + entry_protocol='m3u8_native', 
m3u8_id=format_id, + note='Downloading %s m3u8 information' % server_id, + fatal=False)) + elif format_id == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, lecture_id, mpd_id=format_id, + note='Downloading %s MPD manifest' % server_id, + fatal=False)) + self._sort_formats(formats) + + content = str_or_none(desc.get('content')) + description = (clean_html(compat_b64decode(content).decode('utf-8')) + if content else None) + duration = int_or_none(material.get('duration'), invscale=60) + + return { + 'id': lecture_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + } + + +class PlatziCourseIE(PlatziBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/(?P<id>[^/?\#&]+) + ''' + _TESTS = [{ + 'url': 'https://platzi.com/clases/next-js/', + 'info_dict': { + 'id': '1311', + 'title': 'Curso de Next.js', + }, + 'playlist_count': 22, + }, { + 'url': 'https://courses.platzi.com/classes/communication-codestream/', + 'info_dict': { + 'id': '1367', + 'title': 'Codestream Course', + }, + 'playlist_count': 14, + }] + + @classmethod + def suitable(cls, url): + return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + webpage = self._download_webpage(url, course_name) + + props = self._parse_json( + self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'), + course_name)['initialProps'] + + entries = [] + for chapter_num, chapter in enumerate(props['concepts'], 1): + if not isinstance(chapter, dict): + continue + materials = chapter.get('materials') + if not materials or not isinstance(materials, list): + continue + chapter_title = chapter.get('title') + chapter_id = str_or_none(chapter.get('id')) + for material in materials: + if not isinstance(material, dict): + continue + if material.get('material_type') != 'video': + continue + 
video_url = urljoin(url, material.get('url')) + if not video_url: + continue + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'title': str_or_none(material.get('name')), + 'id': str_or_none(material.get('id')), + 'ie_key': PlatziIE.ie_key(), + 'chapter': chapter_title, + 'chapter_number': chapter_num, + 'chapter_id': chapter_id, + }) + + course_id = compat_str(try_get(props, lambda x: x['course']['id'])) + course_title = try_get(props, lambda x: x['course']['name'], compat_str) + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dlc/extractor/playfm.py b/youtube_dlc/extractor/playfm.py new file mode 100644 index 0000000..e766ccc --- /dev/null +++ b/youtube_dlc/extractor/playfm.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class PlayFMIE(InfoExtractor): + IE_NAME = 'play.fm' + _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' + + _TEST = { + 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', + 'md5': 'c505f8307825a245d0c7ad1850001f22', + 'info_dict': { + 'id': '71276', + 'ext': 'mp3', + 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', + 'description': '', + 'duration': 5627, + 'timestamp': 1406033781, + 'upload_date': '20140722', + 'uploader': 'Dan Drastic', + 'uploader_id': '71170', + 'view_count': int, + 'comment_count': int, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + slug = mobj.group('slug') + + recordings = self._download_json( + 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) + + error = recordings.get('error') + if isinstance(error, dict): + raise ExtractorError( + '%s returned error: %s' % 
(self.IE_NAME, error.get('message')), + expected=True) + + audio_url = recordings['audio'] + video_id = compat_str(recordings.get('id') or video_id) + title = recordings['title'] + description = recordings.get('description') + duration = int_or_none(recordings.get('recordingDuration')) + timestamp = parse_iso8601(recordings.get('created_at')) + uploader = recordings.get('page', {}).get('title') + uploader_id = compat_str(recordings.get('page', {}).get('id')) + view_count = int_or_none(recordings.get('playCount')) + comment_count = int_or_none(recordings.get('commentCount')) + categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] + + return { + 'id': video_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + } diff --git a/youtube_dlc/extractor/playplustv.py b/youtube_dlc/extractor/playplustv.py new file mode 100644 index 0000000..1e30ab2 --- /dev/null +++ b/youtube_dlc/extractor/playplustv.py @@ -0,0 +1,109 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + PUTRequest, +) + + +class PlayPlusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})' + _TEST = { + 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e', + 'md5': 'd078cb89d7ab6b9df37ce23c647aef72', + 'info_dict': { + 'id': 'db8d274a5163424e967f35a30ddafb8e', + 'ext': 'mp4', + 'title': 'Capítulo 179 - Final', + 'description': 'md5:01085d62d8033a1e34121d3c3cabc838', + 'timestamp': 1529992740, + 'upload_date': '20180626', + }, + 'skip': 'Requires account credential', + } + _NETRC_MACHINE = 
'playplustv' + _GEO_COUNTRIES = ['BR'] + _token = None + _profile_id = None + + def _call_api(self, resource, video_id=None, query=None): + return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={ + 'Authorization': 'Bearer ' + self._token, + }, query=query) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + self.raise_login_required() + + req = PUTRequest( + 'https://api.playplus.tv/api/web/login', json.dumps({ + 'email': email, + 'password': password, + }).encode(), { + 'Content-Type': 'application/json; charset=utf-8', + }) + + try: + self._token = self._download_json(req, None)['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(self._parse_json( + e.cause.read(), None)['errorMessage'], expected=True) + raise + + self._profile = self._call_api('Profiles')['list'][0]['_id'] + + def _real_extract(self, url): + project_id, media_id = re.match(self._VALID_URL, url).groups() + media = self._call_api( + 'Media', media_id, { + 'profileId': self._profile, + 'projectId': project_id, + 'mediaId': media_id, + })['obj'] + title = media['title'] + + formats = [] + for f in media.get('files', []): + f_url = f.get('url') + if not f_url: + continue + file_info = f.get('fileInfo') or {} + formats.append({ + 'url': f_url, + 'width': int_or_none(file_info.get('width')), + 'height': int_or_none(file_info.get('height')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumb in media.get('thumbs', []): + thumb_url = thumb.get('url') + if not thumb_url: + continue + thumbnails.append({ + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + }) + + return { + 'id': media_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clean_html(media.get('description')) or media.get('shortDescription'), + 'timestamp': 
int_or_none(media.get('publishDate'), 1000), + 'view_count': int_or_none(media.get('numberOfViews')), + 'comment_count': int_or_none(media.get('numberOfComments')), + 'tags': media.get('tags'), + } diff --git a/youtube_dlc/extractor/plays.py b/youtube_dlc/extractor/plays.py new file mode 100644 index 0000000..ddfc6f1 --- /dev/null +++ b/youtube_dlc/extractor/plays.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PlaysTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})' + _TESTS = [{ + 'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', + 'md5': 'dfeac1198506652b5257a62762cec7bc', + 'info_dict': { + 'id': '56af17f56c95335490', + 'ext': 'mp4', + 'title': 'Bjergsen - When you outplay the Azir wall', + 'description': 'Posted by Bjergsen', + } + }, { + 'url': 'https://plays.tv/embeds/56af17f56c95335490', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://plays.tv/video/%s' % video_id, video_id) + + info = self._search_json_ld(webpage, video_id,) + + mpd_url, sources = re.search( + r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>', + webpage).groups() + formats = self._extract_mpd_formats( + self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') + for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources): + formats.append({ + 'url': self._proto_relative_url(format_url), + 'format_id': 'http-' + format_id, + 'height': int_or_none(height), + }) + self._sort_formats(formats) + + info.update({ + 'id': video_id, + 'description': self._og_search_description(webpage), + 'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'formats': formats, + }) + + return info diff --git 
a/youtube_dlc/extractor/playtvak.py b/youtube_dlc/extractor/playtvak.py new file mode 100644 index 0000000..4c5f579 --- /dev/null +++ b/youtube_dlc/extractor/playtvak.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + qualities, +) + + +class PlaytvakIE(InfoExtractor): + IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' + _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)' + _TESTS = [{ + 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', + 'md5': '4525ae312c324b4be2f4603cc78ceb4a', + 'info_dict': { + 'id': 'A150730_150323_hodinovy-manzel_kuko', + 'ext': 'mp4', + 'title': 'Vyžeňte vosy a sršně ze zahrady', + 'description': 'md5:4436e61b7df227a093778efb7e373571', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 279, + 'timestamp': 1438732860, + 'upload_date': '20150805', + 'is_live': False, + } + }, { # live video test + 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', + 'info_dict': { + 'id': 'A150624_164934_planespotting_cat', + 'ext': 'flv', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': 
True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # idnes.cz + 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku', + 'md5': '819832ba33cd7016e58a6658577fe289', + 'info_dict': { + 'id': 'A150809_104116_domaci_pku', + 'ext': 'mp4', + 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', + 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 39, + 'timestamp': 1438969140, + 'upload_date': '20150807', + 'is_live': False, + } + }, { # lidovky.cz + 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', + 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', + 'info_dict': { + 'id': 'A150808_214044_ln-video_ELE', + 'ext': 'mp4', + 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', + 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1439052180, + 'upload_date': '20150808', + 'is_live': False, + } + }, { # metro.cz + 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', + 'md5': '84fc1deedcac37b7d4a6ccae7c716668', + 'info_dict': { + 'id': 'A141111_173251_metro-extra_row', + 'ext': 'mp4', + 'title': 'Recesisté udělali z billboardu kolotoč', + 'description': 'md5:7369926049588c3989a66c9c1a043c4c', + 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1415725500, + 'upload_date': '20141111', + 'is_live': False, + } + }, { + 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info_url = self._html_search_regex( + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 
'info url') + + parsed_url = compat_urlparse.urlparse(info_url) + + qs = compat_urlparse.parse_qs(parsed_url.query) + qs.update({ + 'reklama': ['0'], + 'type': ['js'], + }) + + info_url = compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + json_info = self._download_json( + info_url, video_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + item = None + for i in json_info['items']: + if i.get('type') == 'video' or i.get('type') == 'stream': + item = i + break + if not item: + raise ExtractorError('No suitable stream found') + + quality = qualities(('low', 'middle', 'high')) + + formats = [] + for fmt in item['video']: + video_url = fmt.get('file') + if not video_url: + continue + + format_ = fmt['format'] + format_id = '%s_%s' % (format_, fmt['quality']) + preference = None + + if format_ in ('mp4', 'webm'): + ext = format_ + elif format_ == 'rtmp': + ext = 'flv' + elif format_ == 'apple': + ext = 'mp4' + # Some streams have mp3 audio which does not play + # well with ffmpeg filter aac_adtstoasc + preference = -1 + elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests + continue + else: # Other formats not supported yet + continue + + formats.append({ + 'url': video_url, + 'ext': ext, + 'format_id': format_id, + 'quality': quality(fmt.get('quality')), + 'preference': preference, + }) + self._sort_formats(formats) + + title = item['title'] + is_live = item['type'] == 'stream' + if is_live: + title = self._live_title(title) + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description', default=None) + timestamp = None + duration = None + if not is_live: + duration = int_or_none(item.get('length')) + timestamp = item.get('published') + if timestamp: + timestamp = parse_iso8601(timestamp[:-5]) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': item.get('image'), + 
'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/playvid.py b/youtube_dlc/extractor/playvid.py new file mode 100644 index 0000000..4aef186 --- /dev/null +++ b/youtube_dlc/extractor/playvid.py @@ -0,0 +1,99 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, +) +from ..utils import ( + clean_html, + ExtractorError, +) + + +class PlayvidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' + _TESTS = [{ + 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', + 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', + 'info_dict': { + 'id': 'RnmBNgtrrJu', + 'ext': 'mp4', + 'title': 'md5:9256d01c6317e3f703848b5906880dc8', + 'duration': 82, + 'age_limit': 18, + }, + 'skip': 'Video removed due to ToS', + }, { + 'url': 'http://www.playvid.com/watch/hwb0GpNkzgH', + 'md5': '39d49df503ad7b8f23a4432cbf046477', + 'info_dict': { + 'id': 'hwb0GpNkzgH', + 'ext': 'mp4', + 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', + 'age_limit': 18, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m_error = re.search( + r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>', webpage) + if m_error: + raise ExtractorError(clean_html(m_error.group('msg')), expected=True) + + video_title = None + duration = None + video_thumbnail = None + formats = [] + + # most of the information is stored in the flashvars + flashvars = self._html_search_regex( + r'flashvars="(.+?)"', webpage, 'flashvars') + + infos = compat_urllib_parse_unquote(flashvars).split(r'&') + for info in infos: + videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info) + if videovars_match: + key = 
videovars_match.group(1) + val = videovars_match.group(2) + + if key == 'title': + video_title = compat_urllib_parse_unquote_plus(val) + if key == 'duration': + try: + duration = int(val) + except ValueError: + pass + if key == 'big_thumb': + video_thumbnail = val + + videourl_match = re.match( + r'^video_urls\]\[(?P<resolution>[0-9]+)p', key) + if videourl_match: + height = int(videourl_match.group('resolution')) + formats.append({ + 'height': height, + 'url': val, + }) + self._sort_formats(formats) + + # Extract title - should be in the flashvars; if not, look elsewhere + if video_title is None: + video_title = self._html_search_regex( + r'<title>(.*?)</title', webpage, 'title') + + return { + 'id': video_id, + 'formats': formats, + 'title': video_title, + 'thumbnail': video_thumbnail, + 'duration': duration, + 'description': None, + 'age_limit': 18 + } diff --git a/youtube_dlc/extractor/playwire.py b/youtube_dlc/extractor/playwire.py new file mode 100644 index 0000000..4d96a10 --- /dev/null +++ b/youtube_dlc/extractor/playwire.py @@ -0,0 +1,75 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + dict_get, + float_or_none, +) + + +class PlaywireIE(InfoExtractor): + _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', + 'md5': 'e6398701e3595888125729eaa2329ed9', + 'info_dict': { + 'id': '3353705', + 'ext': 'mp4', + 'title': 'S04_RM_UCL_Rus', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 145.94, + }, + }, { + # m3u8 in f4m + 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json', + 'info_dict': { + 'id': '4840492', + 'ext': 'mp4', + 'title': 'ITV EL SHOW FULL', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # Multiple resolutions while bitrates missing + 'url': 
'http://cdn.playwire.com/11625/embed/85228.html', + 'only_matching': True, + }, { + 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json', + 'only_matching': True, + }, { + 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') + + player = self._download_json( + 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), + video_id) + + title = player['settings']['title'] + duration = float_or_none(player.get('duration'), 1000) + + content = player['content'] + thumbnail = content.get('poster') + src = content['media']['f4m'] + + formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls') + for a_format in formats: + if not dict_get(a_format, ['tbr', 'width', 'height']): + a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0 + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/pluralsight.py b/youtube_dlc/extractor/pluralsight.py new file mode 100644 index 0000000..abd08bc --- /dev/null +++ b/youtube_dlc/extractor/pluralsight.py @@ -0,0 +1,501 @@ +from __future__ import unicode_literals + +import collections +import json +import os +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + qualities, + srt_subtitles_timecode, + try_get, + update_url_query, + urlencode_postdata, +) + + +class PluralsightBaseIE(InfoExtractor): + _API_BASE = 'https://app.pluralsight.com' + + _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_HEADERS = { + 'Content-Type': 'application/json;charset=UTF-8', + } + _GRAPHQL_COURSE_TMPL = ''' 
+query BootstrapPlayer { + rpc { + bootstrapPlayer { + profile { + firstName + lastName + email + username + userHandle + authed + isAuthed + plan + } + course(courseId: "%s") { + name + title + courseHasCaptions + translationLanguages { + code + name + } + supportsWideScreenVideoFormats + timestamp + modules { + name + title + duration + formattedDuration + author + authorized + clips { + authorized + clipId + duration + formattedDuration + id + index + moduleIndex + moduleTitle + name + title + watched + } + } + } + } + } +}''' + + def _download_course(self, course_id, url, display_id): + try: + return self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + self._GRAPHQL_EP, display_id, data=json.dumps({ + 'query': self._GRAPHQL_COURSE_TMPL % course_id, + 'variables': {} + }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) + + course = try_get( + response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], + dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) + + +class PluralsightIE(PluralsightBaseIE): + IE_NAME = 'pluralsight' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?' 
+ _LOGIN_URL = 'https://app.pluralsight.com/id/' + + _NETRC_MACHINE = 'pluralsight' + + _TESTS = [{ + 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', + 'md5': '4d458cf5cf4c593788672419a8dd4cf8', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', + 'ext': 'mp4', + 'title': 'Demo Monitoring', + 'duration': 338, + }, + 'skip': 'Requires pluralsight account credentials', + }, { + 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live', + 'only_matching': True, + }, { + # available without pluralsight account + 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0', + 'only_matching': True, + }] + + GRAPHQL_VIEWCLIP_TMPL = ''' +query viewClip { + viewClip(input: { + author: "%(author)s", + clipIndex: %(clipIndex)d, + courseName: "%(courseName)s", + includeCaptions: %(includeCaptions)s, + locale: "%(locale)s", + mediaType: "%(mediaType)s", + moduleName: "%(moduleName)s", + quality: "%(quality)s" + }) { + urls { + url + cdn + rank + source + }, + status + } +}''' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'Username': username, + 'Password': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', 
default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + error = self._search_regex( + r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + if all(not re.search(p, response) for p in ( + r'__INITIAL_STATE__', r'["\']currentUser["\']', + # new layout? + r'>\s*Sign out\s*<')): + BLOCKED = 'Your account has been blocked due to suspicious activity' + if BLOCKED in response: + raise ExtractorError( + 'Unable to login: %s' % BLOCKED, expected=True) + MUST_AGREE = 'To continue using Pluralsight, you must agree to' + if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): + raise ExtractorError( + 'Unable to login: %s some documents. Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.' 
+ % MUST_AGREE, expected=True) + + raise ExtractorError('Unable to log in') + + def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): + captions = None + if clip_id: + captions = self._download_json( + '%s/transcript/api/v1/caption/json/%s/%s' + % (self._API_BASE, clip_id, lang), video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False) + if not captions: + captions_post = { + 'a': author, + 'cn': int(clip_idx), + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/player/retrieve-captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] + } + + @staticmethod + def _convert_subtitles(duration, subs): + srt = '' + TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') + TEXT_KEYS = ('text', 'Text') + for num, current in enumerate(subs): + current = subs[num] + start, text = ( + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), + dict_get(current, TEXT_KEYS)) + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) + if end is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + + def _real_extract(self, url): + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + author = qs.get('author', [None])[0] + name = qs.get('name', [None])[0] + clip_idx = qs.get('clip', [None])[0] + course_name = qs.get('course', [None])[0] + + if any(not f for f in (author, 
name, clip_idx, course_name,)): + raise ExtractorError('Invalid URL', expected=True) + + display_id = '%s-%s' % (name, clip_idx) + + course = self._download_course(course_name, url, display_id) + + collection = course['modules'] + + clip = None + + for module_ in collection: + if name in (module_.get('moduleName'), module_.get('name')): + for clip_ in module_.get('clips', []): + clip_index = clip_.get('clipIndex') + if clip_index is None: + clip_index = clip_.get('index') + if clip_index is None: + continue + if compat_str(clip_index) == clip_idx: + clip = clip_ + break + + if not clip: + raise ExtractorError('Unable to resolve clip') + + title = clip['title'] + clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] + + QUALITIES = { + 'low': {'width': 640, 'height': 480}, + 'medium': {'width': 848, 'height': 640}, + 'high': {'width': 1024, 'height': 768}, + 'high-widescreen': {'width': 1280, 'height': 720}, + } + + QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) + quality_key = qualities(QUALITIES_PREFERENCE) + + AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) + + ALLOWED_QUALITIES = ( + AllowedQuality('webm', ['high', ]), + AllowedQuality('mp4', ['low', 'medium', 'high', ]), + ) + + # Some courses also offer widescreen resolution for high quality (see + # https://github.com/ytdl-org/youtube-dl/issues/7766) + widescreen = course.get('supportsWideScreenVideoFormats') is True + best_quality = 'high-widescreen' if widescreen else 'high' + if widescreen: + for allowed_quality in ALLOWED_QUALITIES: + allowed_quality.qualities.append(best_quality) + + # In order to minimize the number of calls to ViewClip API and reduce + # the probability of being throttled or banned by Pluralsight we will request + # only single format until formats listing was explicitly requested. 
+ if self._downloader.params.get('listformats', False): + allowed_qualities = ALLOWED_QUALITIES + else: + def guess_allowed_qualities(): + req_format = self._downloader.params.get('format') or 'best' + req_format_split = req_format.split('-', 1) + if len(req_format_split) > 1: + req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) + for allowed_quality in ALLOWED_QUALITIES: + if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: + return (AllowedQuality(req_ext, (req_quality, )), ) + req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' + return (AllowedQuality(req_ext, (best_quality, )), ) + allowed_qualities = guess_allowed_qualities() + + formats = [] + for ext, qualities_ in allowed_qualities: + for quality in qualities_: + f = QUALITIES[quality].copy() + clip_post = { + 'author': author, + 'includeCaptions': 'false', + 'clipIndex': int(clip_idx), + 'courseName': course_name, + 'locale': 'en', + 'moduleName': name, + 'mediaType': ext, + 'quality': '%dx%d' % (f['width'], f['height']), + } + format_id = '%s-%s' % (ext, quality) + + try: + viewclip = self._download_json( + self._GRAPHQL_EP, display_id, + 'Downloading %s viewclip graphql' % format_id, + data=json.dumps({ + 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, + 'variables': {} + }).encode('utf-8'), + headers=self._GRAPHQL_HEADERS)['data']['viewClip'] + except ExtractorError: + # Still works but most likely will go soon + viewclip = self._download_json( + '%s/video/clips/viewclip' % self._API_BASE, display_id, + 'Downloading %s viewclip JSON' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + + # Pluralsight tracks multiple sequential calls to ViewClip API and start + # to return 429 HTTP errors after some time (see + # https://github.com/ytdl-org/youtube-dl/pull/6989). 
Moreover it may even lead + # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). + # To somewhat reduce the probability of these consequences + # we will sleep random amount of time before each call to ViewClip. + self._sleep( + random.randint(2, 5), display_id, + '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') + + if not viewclip: + continue + + clip_urls = viewclip.get('urls') + if not isinstance(clip_urls, list): + continue + + for clip_url_data in clip_urls: + clip_url = clip_url_data.get('url') + if not clip_url: + continue + cdn = clip_url_data.get('cdn') + clip_f = f.copy() + clip_f.update({ + 'url': clip_url, + 'ext': ext, + 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, + 'quality': quality_key(quality), + 'source_preference': int_or_none(clip_url_data.get('rank')), + }) + formats.append(clip_f) + + self._sort_formats(formats) + + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? 
+ subtitles = self.extract_subtitles( + author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) + + return { + 'id': clip_id, + 'title': title, + 'duration': duration, + 'creator': author, + 'formats': formats, + 'subtitles': subtitles, + } + + +class PluralsightCourseIE(PluralsightBaseIE): + IE_NAME = 'pluralsight:course' + _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)' + _TESTS = [{ + # Free course from Pluralsight Starter Subscription for Microsoft TechNet + # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz + 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas', + 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', + 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', + }, + 'playlist_count': 31, + }, { + # available without pluralsight account + 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', + 'only_matching': True, + }, { + 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + # TODO: PSM cookie + + course = self._download_course(course_id, url, course_id) + + title = course['title'] + course_name = course['name'] + course_data = course['modules'] + description = course.get('description') or course.get('shortDescription') + + entries = [] + for num, module in enumerate(course_data, 1): + author = module.get('author') + module_name = module.get('name') + if not author or not module_name: + continue + for clip in module.get('clips', []): + clip_index = int_or_none(clip.get('index')) + if clip_index is None: + continue + clip_url = update_url_query( + '%s/player' % self._API_BASE, query={ + 'mode': 'live', + 'course': 
course_name, + 'author': author, + 'name': module_name, + 'clip': clip_index, + }) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': PluralsightIE.ie_key(), + 'chapter': module.get('title'), + 'chapter_number': num, + 'chapter_id': module.get('moduleRef'), + }) + + return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dlc/extractor/podomatic.py b/youtube_dlc/extractor/podomatic.py new file mode 100644 index 0000000..e782e3f --- /dev/null +++ b/youtube_dlc/extractor/podomatic.py @@ -0,0 +1,76 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PodomaticIE(InfoExtractor): + IE_NAME = 'podomatic' + _VALID_URL = r'''(?x) + (?P<proto>https?):// + (?: + (?P<channel>[^.]+)\.podomatic\.com/entry| + (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes + )/ + (?P<id>[^/?#&]+) + ''' + + _TESTS = [{ + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. 
When the Moon Hits Your Eye', + 'duration': 446, + } + }, { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, { + 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + channel = mobj.group('channel') or mobj.group('channel_2') + + json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + + '?permalink=true&rtmp=0') % + (mobj.group('proto'), channel, video_id)) + data_json = self._download_webpage( + json_url, video_id, 'Downloading video info') + data = json.loads(data_json) + + video_url = data['downloadLink'] + if not video_url: + video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) + uploader = data['podcast'] + title = data['title'] + thumbnail = data['imageLocation'] + duration = int_or_none(data.get('length'), 1000) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'uploader': uploader, + 'uploader_id': channel, + 'thumbnail': thumbnail, + 'duration': duration, + } diff --git a/youtube_dlc/extractor/pokemon.py b/youtube_dlc/extractor/pokemon.py new file mode 100644 index 0000000..14ee1a7 --- /dev/null +++ b/youtube_dlc/extractor/pokemon.py @@ -0,0 +1,138 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + js_to_json, + merge_dicts, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' + _TESTS = [{ + 'url': 
'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', + 'md5': '2fe8eaec69768b25ef898cda9c43062e', + 'info_dict': { + 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', + 'ext': 'mp4', + 'title': 'The Ol’ Raise and Switch!', + 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + }, + 'add_id': ['LimelightMedia'], + }, { + # no data-video-title + 'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008', + 'info_dict': { + 'id': 'dfbaf830d7e54e179837c50c0c6cc0e1', + 'ext': 'mp4', + 'title': "Pokémon : L'ascension de Darkrai", + 'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5', + }, + 'add_id': ['LimelightMedia'], + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data.get('data-video-title') or self._html_search_meta( + 'pkm-title', webpage, ' title', default=None) or self._search_regex( + r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 
'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } + + +class PokemonWatchIE(InfoExtractor): + _VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/player\.html\?id=(?P<id>[a-z0-9]{32})' + _API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}' + _TESTS = [{ + 'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667', + 'md5': '62833938a31e61ab49ada92f524c42ff', + 'info_dict': { + 'id': '8309a40969894a8e8d5bc1311e9c5667', + 'ext': 'mp4', + 'title': 'Lillier and the Staff!', + 'description': 'md5:338841b8c21b283d24bdc9b568849f04', + } + }, { + 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07', + 'only_matching': True + }] + + def _extract_media(self, channel_array, video_id): + for channel in channel_array: + for media in channel.get('media'): + if media.get('id') == video_id: + return media + return None + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = { + '_type': 'url', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + # API call can be avoided entirely if we are listing formats + if self._downloader.params.get('listformats', False): + return info + + webpage = self._download_webpage(url, video_id) + build_vars = self._parse_json(self._search_regex( + r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'), + video_id, transform_source=js_to_json) + region = build_vars.get('region') + channel_array = self._download_json(self._API_URL.format(region), video_id) + video_data = self._extract_media(channel_array, video_id) + + if video_data is None: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + info['_type'] = 'url_transparent' + images = video_data.get('images') + + return merge_dicts(info, { + 'title': video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': 
images.get('medium') or images.get('small'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('season')), + 'episode': video_data.get('title'), + 'episode_number': int_or_none(video_data.get('episode')), + }) diff --git a/youtube_dlc/extractor/polskieradio.py b/youtube_dlc/extractor/polskieradio.py new file mode 100644 index 0000000..978d6f8 --- /dev/null +++ b/youtube_dlc/extractor/polskieradio.py @@ -0,0 +1,180 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, + compat_urlparse +) +from ..utils import ( + extract_attributes, + int_or_none, + strip_or_none, + unified_timestamp, +) + + +class PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { + 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'info_dict': { + 'id': '1635803', + 'title': 'Euro 2016: nie ma miejsca na błąd. 
Polacy grają ze Szwajcarią o ćwierćfinał', + 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + content = self._search_regex( + r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', + webpage, 'content') + + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', fatal=False)) + + thumbnail_url = self._og_search_thumbnail(webpage) + + entries = [] + + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): + media = self._parse_json(data_media, playlist_id, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file'], 'http:') + if media_url in media_urls: + continue + media_urls.add(media_url) + entries.append({ + 'id': compat_str(media['id']), + 'url': media_url, + 'title': compat_urllib_parse_unquote(media['desc']), + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url + }) + + title = self._og_search_title(webpage).strip() + description = strip_or_none(self._og_search_description(webpage)) + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioCategoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' 
+ _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + 'info_dict': { + 'id': '5102', + 'title': 'HISTORIA ŻYWA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'http://www.polskieradio.pl/7/4807', + 'info_dict': { + 'id': '4807', + 'title': 'Vademecum 1050. rocznicy Chrztu Polski' + }, + 'playlist_mincount': 5 + }, { + 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', + 'only_matching': True + }, { + 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', + 'info_dict': { + 'id': '4143', + 'title': 'Kierunek Kraków', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', + 'info_dict': { + 'id': '214', + 'title': 'Muzyka', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + + def _entries(self, url, page, category_id): + content = page + for page_num in itertools.count(2): + for a_entry, entry_id in re.findall( + r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', + content): + entry = extract_attributes(a_entry) + href = entry.get('href') + if not href: + continue + yield self.url_result( + compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), + entry_id, entry.get('title')) + mobj = re.search( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', + content) + if not mobj: + break + next_url = compat_urlparse.urljoin(url, mobj.group('url')) + content = self._download_webpage( + next_url, category_id, 'Downloading page %s' % page_num) + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage(url, category_id) + title = 
self._html_search_regex( + r'<title>([^<]+) - [^<]+ - [^<]+', + webpage, 'title', fatal=False) + return self.playlist_result( + self._entries(url, webpage, category_id), + category_id, title) diff --git a/youtube_dlc/extractor/popcorntimes.py b/youtube_dlc/extractor/popcorntimes.py new file mode 100644 index 0000000..7bf7f98 --- /dev/null +++ b/youtube_dlc/extractor/popcorntimes.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_chr, +) +from ..utils import int_or_none + + +class PopcorntimesIE(InfoExtractor): + _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P[^/]+)/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', + 'md5': '93f210991ad94ba8c3485950a2453257', + 'info_dict': { + 'id': 'A1XCFvz', + 'display_id': 'haensel-und-gretel-opera-fantasy', + 'ext': 'mp4', + 'title': 'Hänsel und Gretel', + 'description': 'md5:1b8146791726342e7b22ce8125cf6945', + 'thumbnail': r're:^https?://.*\.jpg$', + 'creator': 'John Paul', + 'release_date': '19541009', + 'duration': 4260, + 'tbr': 5380, + 'width': 720, + 'height': 540, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + title = self._search_regex( + r'

([^<]+)', webpage, 'title', + default=None) or self._html_search_meta( + 'ya:ovs:original_name', webpage, 'title', fatal=True) + + loc = self._search_regex( + r'PCTMLOC\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'loc', + group='value') + + loc_b64 = '' + for c in loc: + c_ord = ord(c) + if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'): + upper = ord('Z') if c_ord <= ord('Z') else ord('z') + c_ord += 13 + if upper < c_ord: + c_ord -= 26 + loc_b64 += compat_chr(c_ord) + + video_url = compat_b64decode(loc_b64).decode('utf-8') + + description = self._html_search_regex( + r'(?s)]+class=["\']pt-movie-desc[^>]+>(.+?)', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r']+class=["\']video-preview[^>]+\bsrc=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'thumbnail', default=None, + group='value') or self._og_search_thumbnail(webpage) + + creator = self._html_search_meta( + 'video:director', webpage, 'creator', default=None) + + release_date = self._html_search_meta( + 'video:release_date', webpage, default=None) + if release_date: + release_date = release_date.replace('-', '') + + def int_meta(name): + return int_or_none(self._html_search_meta( + name, webpage, default=None)) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'creator': creator, + 'release_date': release_date, + 'duration': int_meta('video:duration'), + 'tbr': int_meta('ya:ovs:bitrate'), + 'width': int_meta('og:video:width'), + 'height': int_meta('og:video:height'), + 'http_headers': { + 'Referer': url, + }, + } diff --git a/youtube_dlc/extractor/popcorntv.py b/youtube_dlc/extractor/popcorntv.py new file mode 100644 index 0000000..9f834fb --- /dev/null +++ b/youtube_dlc/extractor/popcorntv.py @@ -0,0 +1,76 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + 
unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P[^/]+)/(?P\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r']+itemprop=["\']name[^>]*>([^<]+)', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)]+itemprop=["\']description[^>]*>(.+?)', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 
'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/porn91.py b/youtube_dlc/extractor/porn91.py new file mode 100644 index 0000000..20eac64 --- /dev/null +++ b/youtube_dlc/extractor/porn91.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, + ExtractorError, +) + + +class Porn91IE(InfoExtractor): + IE_NAME = '91porn' + _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P[\w\d]+)' + + _TEST = { + 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', + 'md5': '7fcdb5349354f40d41689bd0fa8db05a', + 'info_dict': { + 'id': '7e42283b4f5ab36da134', + 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', + 'ext': 'mp4', + 'duration': 431, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + self._set_cookie('91porn.com', 'language', 'cn_CN') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) + + if '作为游客,你每天只可观看10个视频' in webpage: + raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) + + title = self._search_regex( + r'
([^<]+)
', webpage, 'title') + title = title.replace('\n', '') + + video_link_url = self._search_regex( + r']+id=["\']fm-video_link[^>]+>([^<]+)', + webpage, 'video link') + videopage = self._download_webpage(video_link_url, video_id) + + info_dict = self._parse_html5_media_entries(url, videopage, video_id)[0] + + duration = parse_duration(self._search_regex( + r'时长:\s*\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + + comment_count = int_or_none(self._search_regex( + r'留言:\s*\s*(\d+)', webpage, 'comment count', fatal=False)) + + info_dict.update({ + 'id': video_id, + 'title': title, + 'duration': duration, + 'comment_count': comment_count, + 'age_limit': self._rta_search(webpage), + }) + + return info_dict diff --git a/youtube_dlc/extractor/porncom.py b/youtube_dlc/extractor/porncom.py new file mode 100644 index 0000000..5726cab --- /dev/null +++ b/youtube_dlc/extractor/porncom.py @@ -0,0 +1,103 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_filesize, + str_to_int, +) + + +class PornComIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P[^/]+)-)?(?P\d+)' + _TESTS = [{ + 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', + 'md5': '3f30ce76267533cd12ba999263156de7', + 'info_dict': { + 'id': '2603339', + 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', + 'ext': 'mp4', + 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 551, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + 
display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + (r'=\s*({.+?})\s*;\s*v1ar\b', + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), + webpage, 'config', default='{}'), + display_id, transform_source=js_to_json, fatal=False) + + if config: + title = config['title'] + formats = [{ + 'url': stream['url'], + 'format_id': stream.get('id'), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) + } for stream in config['streams'] if stream.get('url')] + thumbnail = (compat_urlparse.urljoin( + config['thumbCDN'], config['poster']) + if config.get('thumbCDN') and config.get('poster') else None) + duration = int_or_none(config.get('length')) + else: + title = self._search_regex( + (r'([^<]+)', r']*>([^<]+)

'), + webpage, 'title') + formats = [{ + 'url': compat_urlparse.urljoin(url, format_url), + 'format_id': '%sp' % height, + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + } for format_url, height, filesize in re.findall( + r']+href="(/download/[^"]+)">[^<]*?(\d+)p]*>(\d+\s*[a-zA-Z]+)<', + webpage)] + thumbnail = None + duration = None + + self._sort_formats(formats) + + view_count = str_to_int(self._search_regex( + (r'Views:\s*\s*\s*([\d,.]+)', + r'class=["\']views["\'][^>]*>

([\d,.]+)'), webpage, + 'view count', fatal=False)) + + def extract_list(kind): + s = self._search_regex( + (r'(?s)%s:\s*\s*(.+?)' % kind.capitalize(), + r'(?s)]*>%s:(.+?)

' % kind.capitalize()), + webpage, kind, fatal=False) + return re.findall(r']+>([^<]+)', s or '') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + 'categories': extract_list('categories'), + 'tags': extract_list('tags'), + } diff --git a/youtube_dlc/extractor/pornhd.py b/youtube_dlc/extractor/pornhd.py new file mode 100644 index 0000000..c6052ac --- /dev/null +++ b/youtube_dlc/extractor/pornhd.py @@ -0,0 +1,121 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + js_to_json, + merge_dicts, + urljoin, +) + + +class PornHdIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P\d+)(?:/(?P.+))?' + _TESTS = [{ + 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'md5': '87f1540746c1d32ec7a2305c12b96b25', + 'info_dict': { + 'id': '9864', + 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', + 'ext': 'mp4', + 'title': 'Restroom selfie masturbation', + 'description': 'md5:3748420395e03e31ac96857a8f125b2b', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + 'skip': 'HTTP Error 404: Not Found', + }, { + 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', + 'info_dict': { + 'id': '1962', + 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + 'ext': 'mp4', + 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', + 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + 'age_limit': 18, + }, + }] + + def _real_extract(self, url): + mobj = 
re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id or video_id) + + title = self._html_search_regex( + [r']+class=["\']video-name["\'][^>]*>([^<]+)', + r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') + + sources = self._parse_json(js_to_json(self._search_regex( + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", + webpage, 'sources', default='{}')), video_id) + + info = {} + if not sources: + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info = entries[0] + + if not sources and not info: + message = self._html_search_regex( + r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class=["\']video-description[^>]+>(?P.+?)', + r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, + 'thumbnail', default=None, group='url') + + like_count = int_or_none(self._search_regex( + (r'(\d+)
\s*likes', + r'(\d+)\s*]+>(?: |\s)*\blikes', + r'class=["\']save-count["\'][^>]*>\s*(\d+)'), + webpage, 'like count', fatal=False)) + + return merge_dicts(info, { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'like_count': like_count, + 'formats': formats, + 'age_limit': 18, + }) diff --git a/youtube_dlc/extractor/pornhub.py b/youtube_dlc/extractor/pornhub.py new file mode 100644 index 0000000..529f3f7 --- /dev/null +++ b/youtube_dlc/extractor/pornhub.py @@ -0,0 +1,618 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools +import itertools +import operator +import re + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urllib_request, +) +from .openload import PhantomJSwrapper +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + merge_dicts, + NO_DEFAULT, + orderedSet, + remove_quotes, + str_to_int, + url_or_none, +) + + +class PornHubBaseIE(InfoExtractor): + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r']+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + +class PornHubIE(PornHubBaseIE): + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + 
(?P[\da-z]+) + ''' + _TESTS = [{ + 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', + 'md5': 'a6391306d050e4547f62b3f485dd9ba9', + 'info_dict': { + 'id': '648719015', + 'ext': 'mp4', + 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', + 'uploader': 'Babes', + 'upload_date': '20130628', + 'timestamp': 1372447216, + 'duration': 361, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + }, + }, { + # non-ASCII title + 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', + 'info_dict': { + 'id': '1331683002', + 'ext': 'mp4', + 'title': '重庆婷婷女王足交', + 'upload_date': '20150213', + 'timestamp': 1423804862, + 'duration': 1753, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + # subtitles + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', + 'info_dict': { + 'id': 'ph5af5fef7c2aa7', + 'ext': 'mp4', + 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', + 'uploader': 'BFFs', + 'duration': 622, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': list, + 'categories': list, + 'subtitles': { + 'en': [{ + "ext": 'srt' + }] + }, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video has been disabled', + }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', + 'only_matching': True, + }, { + # removed at the request of cam4.com + 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', + 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 
'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, + }, { + # private video + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', + 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + webpage) + + def _extract_count(self, pattern, webpage, name): + return str_to_int(self._search_regex( + pattern, webpage, '%s count' % name, fatal=False)) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') or 'pornhub.com' + video_id = mobj.group('id') + + if 'premium' in host: + if not self._downloader.params.get('cookiefile'): + raise ExtractorError( + 'PornHub Premium requires authentication.' 
+ ' You may want to use --cookies.', + expected=True) + + self._set_cookie(host, 'age_verified', '1') + + def dl_webpage(platform): + self._set_cookie(host, 'platform', platform) + return self._download_webpage( + 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), + video_id, 'Downloading %s webpage' % platform) + + webpage = dl_webpage('pc') + + error_msg = self._html_search_regex( + r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + webpage, 'error message', default=None, group='error') + if error_msg: + error_msg = re.sub(r'\s+', ' ', error_msg) + raise ExtractorError( + 'PornHub said: %s' % error_msg, + expected=True, video_id=video_id) + + # video_title from flashvars contains whitespace instead of non-ASCII (see + # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying + # on that anymore. + title = self._html_search_meta( + 'twitter:title', webpage, default=None) or self._html_search_regex( + (r'(?s)]+class=["\']title["\'][^>]*>(?P.+?)</h1>', + r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', + r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title') + + video_urls = [] + video_urls_set = set() + subtitles = {} + + flashvars = self._parse_json( + self._search_regex( + r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + video_id) + if flashvars: + subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) + if subtitle_url: + subtitles.setdefault('en', []).append({ + 'url': subtitle_url, + 'ext': 'srt', + }) + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue 
+ if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) + else: + thumbnail, duration = [None] * 2 + + def extract_js_vars(webpage, pattern, default=NO_DEFAULT): + assignments = self._search_regex( + pattern, webpage, 'encoded url', default=default) + if not assignments: + return {} + + assignments = assignments.split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + return js_vars + + def add_video_url(video_url): + v_url = url_or_none(video_url) + if not v_url: + return + if v_url in video_urls_set: + return + video_urls.append((v_url, None)) + video_urls_set.add(v_url) + + if not video_urls: + FORMAT_PREFIXES = ('media', 'quality') + js_vars = extract_js_vars( + webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), + default=None) + if js_vars: + for key, format_url in js_vars.items(): + if any(key.startswith(p) for p in FORMAT_PREFIXES): + add_video_url(format_url) + if not video_urls and re.search( + r'<[^>]+\bid=["\']lockedPlayer', webpage): + raise ExtractorError( + 'Video %s is locked' % video_id, expected=True) + + if not video_urls: + js_vars = extract_js_vars( + dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') + add_video_url(js_vars['mediastring']) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + 
+ upload_date = None + formats = [] + for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') + ext = determine_ext(video_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) + + video_uploader = self._html_search_regex( + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', + webpage, 'uploader', default=None) + + view_count = self._extract_count( + r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view') + like_count = self._extract_count( + r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') + dislike_count = self._extract_count( + r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') + comment_count = self._extract_count( + r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') + + def extract_list(meta_key): + div = self._search_regex( + r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' + % meta_key, webpage, meta_key, default=None) + if div: + return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div) + + info = self._search_json_ld(webpage, video_id, default={}) + # description provided in JSON-LD is irrelevant + info['description'] = None + + return merge_dicts({ + 'id': video_id, + 'uploader': 
video_uploader, + 'upload_date': upload_date, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'comment_count': comment_count, + 'formats': formats, + 'age_limit': 18, + 'tags': extract_list('tags'), + 'categories': extract_list('categories'), + 'subtitles': subtitles, + }, info) + + +class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_entries(self, webpage, host): + # Only process container div with main playlist content skipping + # drop-down menu that uses similar pattern for videos (see + # https://github.com/ytdl-org/youtube-dl/issues/11594). + container = self._search_regex( + r'(?s)(<div[^>]+class=["\']container.+)', webpage, + 'container', default=webpage) + + return [ + self.url_result( + 'http://www.%s/%s' % (host, video_url), + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + container)) + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + playlist_id = mobj.group('id') + + webpage = self._download_webpage(url, playlist_id) + + entries = self._extract_entries(webpage, host) + + playlist = self._parse_json( + self._search_regex( + r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, + 'playlist', default='{}'), + playlist_id, fatal=False) + title = playlist.get('title') or self._search_regex( + r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) + + return self.playlist_result( + entries, playlist_id, title, playlist.get('description')) + + +class PornHubUserIE(PornHubPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph', + 'playlist_mincount': 
118, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious', + 'info_dict': { + 'id': 'liz-vicious', + }, + 'playlist_mincount': 118, + }, { + 'url': 'https://www.pornhub.com/users/russianveet69', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/channels/povd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('id') + return self.url_result( + '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), + video_id=user_id) + + +class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): + @staticmethod + def _has_more(webpage): + return re.search( + r'''(?x) + <li[^>]+\bclass=["\']page_next| + <link[^>]+\brel=["\']next| + <button[^>]+\bid=["\']moreDataBtn + ''', webpage) is not None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + page = int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + + entries = [] + for page_num in (page, ) if page is not None else itertools.count(1): + try: + webpage = self._download_webpage( + url, item_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + break + raise + page_entries = self._extract_entries(webpage, host) + if not page_entries: + break + entries.extend(page_entries) + if not self._has_more(webpage): + break + + return self.playlist_result(orderedSet(entries), item_id) + + +class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.pornhub.com/model/zoe_ph/videos', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 
'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 149, + }, { + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3', + 'info_dict': { + 'id': 'pornstar/jenny-blighe/videos', + }, + 'playlist_mincount': 40, + }, { + # default sorting as Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos', + 'info_dict': { + 'id': 'channels/povd/videos', + }, + 'playlist_mincount': 293, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', + 'only_matching': True, + }, { + # Most Recent Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', + 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv', + 'only_matching': True, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr', + 'only_matching': True, + }, { + # Longest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg', + 'only_matching': True, + }, { + # Newest Videos + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video/search?search=123', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/categories/teen', + 'only_matching': 
True, + }, { + 'url': 'https://www.pornhub.com/categories/teen?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/hd?page=3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/described-video?page=2', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', + 'only_matching': True, + }, { + 'url': 'https://www.pornhub.com/playlist/44121572', + 'info_dict': { + 'id': 'playlist/44121572', + }, + 'playlist_mincount': 132, + }, { + 'url': 'https://www.pornhub.com/playlist/4667351', + 'only_matching': True, + }, { + 'url': 'https://de.pornhub.com/playlist/4667351', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False + if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) + else super(PornHubPagedVideoListIE, cls).suitable(url)) + + +class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): + _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' + _TESTS = [{ + 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', + 'info_dict': { + 'id': 'jenny-blighe', + }, + 'playlist_mincount': 129, + }, { + 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', + 'only_matching': True, + }] diff --git a/youtube_dlc/extractor/pornotube.py b/youtube_dlc/extractor/pornotube.py new file mode 100644 index 0000000..1b5b9a3 --- /dev/null +++ b/youtube_dlc/extractor/pornotube.py @@ -0,0 +1,85 @@ +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PornotubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)' + _TEST 
= { + 'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science', + 'md5': '60fc5a4f0d93a97968fc7999d98260c9', + 'info_dict': { + 'id': '4964', + 'ext': 'mp4', + 'upload_date': '20141203', + 'title': 'Weird Hot and Wet Science', + 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0', + 'categories': ['Adult Humor', 'Blondes'], + 'uploader': 'Alpha Blue Archives', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1417582800, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] + + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] + + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) + + info = self._download_json( + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] + + timestamp = int_or_none(info.get('publishDate'), scale=1000) + uploader = info.get('studios', [{}])[0].get('name') + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if start 
and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': info.get('description'), + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'thumbnail': thumbnail, + 'categories': categories, + 'age_limit': 18, + } diff --git a/youtube_dlc/extractor/pornovoisines.py b/youtube_dlc/extractor/pornovoisines.py new file mode 100644 index 0000000..b6b7106 --- /dev/null +++ b/youtube_dlc/extractor/pornovoisines.py @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, + unified_strdate, +) + + +class PornoVoisinesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)' + + _TEST = { + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', + 'info_dict': { + 'id': '919', + 'display_id': 'recherche-appartement', + 'ext': 'mp4', + 'title': 'Recherche appartement', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20140925', + 'duration': 120, + 'view_count': int, + 'average_rating': float, + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], + 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + 
formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + self._sort_formats(formats) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + # The webpage has a bug - there's no space between "thumb" and src= + thumbnail = self._html_search_regex( + r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') + + upload_date = unified_strdate(self._search_regex( + r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') + view_count = int_or_none(self._search_regex( + r'(\d+) vues', webpage, 'view count', fatal=False)) + average_rating = self._search_regex( + r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) + if average_rating: + average_rating = float_or_none(average_rating.replace(',', '.')) + + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False) + if categories: + categories = [category.strip() for category in categories.split(',')] + + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'categories': categories, + 'age_limit': 18, + 'subtitles': subtitles, + } diff --git a/youtube_dlc/extractor/pornoxo.py b/youtube_dlc/extractor/pornoxo.py new file mode 100644 index 0000000..2831368 --- /dev/null +++ 
b/youtube_dlc/extractor/pornoxo.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + str_to_int, +) + + +class PornoXOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' + _TEST = { + 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', + 'md5': '582f28ecbaa9e6e24cb90f50f524ce87', + 'info_dict': { + 'id': '7564', + 'ext': 'flv', + 'title': 'Striptease From Sexy Secretary!', + 'display_id': 'striptease-from-sexy-secretary', + 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', + 'categories': list, # NSFW + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.groups() + + webpage = self._download_webpage(url, video_id) + video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) + + title = self._html_search_regex( + r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title') + + view_count = str_to_int(self._html_search_regex( + r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) + + categories_str = self._html_search_regex( + r'<meta name="description" content=".*featuring\s*([^"]+)"', + webpage, 'categories', fatal=False) + categories = ( + None if categories_str is None + else categories_str.split(',')) + + video_data.update({ + 'id': video_id, + 'title': title, + 'display_id': display_id, + 'description': self._html_search_meta('description', webpage), + 'categories': categories, + 'view_count': view_count, + 'age_limit': 18, + }) + + return video_data diff --git a/youtube_dlc/extractor/presstv.py b/youtube_dlc/extractor/presstv.py new file mode 100644 index 0000000..b5c2792 --- /dev/null +++ b/youtube_dlc/extractor/presstv.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor 
+from ..utils import remove_start + + +class PressTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?' + + _TEST = { + 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', + 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', + 'info_dict': { + 'id': '459911', + 'display_id': 'Australian-sewerage-treatment-facility-', + 'ext': 'mp4', + 'title': 'Organic mattresses used to clean waste water', + 'upload_date': '20160409', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + # extract video URL from webpage + video_url = self._hidden_inputs(webpage)['inpPlayback'] + + # build list of available formats + # specified in http://www.presstv.ir/Scripts/playback.js + base_url = 'http://192.99.219.222:82/presstv' + _formats = [ + (180, '_low200.mp4'), + (360, '_low400.mp4'), + (720, '_low800.mp4'), + (1080, '.mp4') + ] + + formats = [{ + 'url': base_url + video_url[:-4] + extension, + 'format_id': '%dp' % height, + 'height': height, + } for height, extension in _formats] + + # extract video metadata + title = remove_start( + self._html_search_meta('title', webpage, fatal=True), 'PressTV-') + + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + upload_date = '%04d%02d%02d' % ( + int(mobj.group('y')), + int(mobj.group('m')), + int(mobj.group('d')), + ) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'description': description + } diff --git a/youtube_dlc/extractor/prosiebensat1.py b/youtube_dlc/extractor/prosiebensat1.py new file 
mode 100644 index 0000000..e470882 --- /dev/null +++ b/youtube_dlc/extractor/prosiebensat1.py @@ -0,0 +1,500 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from hashlib import sha1 +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, + merge_dicts, + unified_strdate, +) + + +class ProSiebenSat1BaseIE(InfoExtractor): + _GEO_BYPASS = False + _ACCESS_ID = None + _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' + _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' + + def _extract_video_info(self, url, clip_id): + client_location = url + + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': self._TOKEN, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'ids': clip_id, + })[0] + + if video.get('is_protected') is True: + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = [] + if self._ACCESS_ID: + raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID + protocols = self._download_json( + self._V4_BASE_URL + 'protocols', clip_id, + 'Downloading protocols JSON', + headers=self.geo_verification_headers(), query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct).encode()).hexdigest(), + 'video_id': clip_id, + }, fatal=False, expected_status=(403,)) or {} + error = protocols.get('error') or {} + if error.get('title') == 'Geo check failed': + self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) + server_token = protocols.get('server_token') + if server_token: + urls = (self._download_json( + self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ + 'access_id': self._ACCESS_ID, + 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), + 'protocols': self._SUPPORTED_PROTOCOLS, + 'server_token': 
server_token, + 'video_id': clip_id, + }, fatal=False) or {}).get('urls') or {} + for protocol, variant in urls.items(): + source_url = variant.get('clear', {}).get('url') + if not source_url: + continue + if protocol == 'dash': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id=protocol, fatal=False)) + elif protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id=protocol, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': protocol, + }) + if not formats: + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + }) + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = 
urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) + self._sort_formats(formats) + + return { + 'duration': float_or_none(video.get('duration')), + 'formats': formats, + } + + +class ProSiebenSat1IE(ProSiebenSat1BaseIE): + IE_NAME = 'prosiebensat1' + IE_DESC = 'ProSiebenSat.1 Digital' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?:beta\.)? 
+ (?: + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia + )\.(?:de|at|ch)| + ran\.de|fem\.com|advopedia\.de|galileo\.tv/video + ) + /(?P<id>.+) + ''' + + _TESTS = [ + { + # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242 + # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215: + # - malformed f4m manifest support + # - proper handling of URLs starting with `https?://` in 2.0 manifests + # - recursive child f4m manifests extraction + 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', + 'info_dict': { + 'id': '2104602', + 'ext': 'mp4', + 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', + 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', + 'upload_date': '20131231', + 'duration': 5845.04, + 'series': 'CIRCUS HALLIGALLI', + 'season_number': 2, + 'episode': 'Episode 18 - Staffel 2', + 'episode_number': 18, + }, + }, + { + 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', + 'info_dict': { + 'id': '2570327', + 'ext': 'mp4', + 'title': 'Lady-Umstyling für Audrina', + 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d', + 'upload_date': '20131014', + 'duration': 606.76, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Seems to be broken', + }, + { + 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge', + 'info_dict': { + 'id': '2429369', + 'ext': 'mp4', + 'title': 'Countdown für die Autowerkstatt', + 'description': 'md5:809fc051a457b5d8666013bc40698817', + 'upload_date': '20140223', + 'duration': 2595.04, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', + 
'info_dict': { + 'id': '2904997', + 'ext': 'mp4', + 'title': 'Sexy laufen in Ugg Boots', + 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6', + 'upload_date': '20140122', + 'duration': 245.32, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', + 'info_dict': { + 'id': '2906572', + 'ext': 'mp4', + 'title': 'Im Interview: Kai Wiesinger', + 'description': 'md5:e4e5370652ec63b95023e914190b4eb9', + 'upload_date': '20140203', + 'duration': 522.56, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', + 'info_dict': { + 'id': '2992323', + 'ext': 'mp4', + 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2', + 'description': 'md5:2669cde3febe9bce13904f701e774eb6', + 'upload_date': '20141014', + 'duration': 2410.44, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', + 'info_dict': { + 'id': '3004256', + 'ext': 'mp4', + 'title': 'Schalke: Tönnies möchte Raul zurück', + 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f', + 'upload_date': '20140226', + 'duration': 228.96, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'This video is unavailable', + }, + { + 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', + 'info_dict': { + 'id': '2572814', + 'ext': 'mp4', + 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', + 'description': 'md5:6ddb02b0781c6adf778afea606652e38', + 'timestamp': 1382041620, + 'upload_date': '20131017', + 'duration': 469.88, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 
'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', + 'info_dict': { + 'id': '2156342', + 'ext': 'mp4', + 'title': 'Kurztrips zum Valentinstag', + 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', + 'duration': 307.24, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist', + 'info_dict': { + 'id': '439664', + 'title': 'Episode 8 - Ganze Folge - Playlist', + 'description': 'md5:63b8963e71f481782aeea877658dec84', + }, + 'playlist_count': 2, + 'skip': 'This video is unavailable', + }, + { + # title in <h2 class="subtitle"> + 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', + 'info_dict': { + 'id': '4895826', + 'ext': 'mp4', + 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', + 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', + 'upload_date': '20170302', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'geo restricted to Germany', + }, + { + # geo restricted to Germany + 'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge', + 'only_matching': True, + }, + { + # geo restricted to Germany + 'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden', + 'only_matching': True, + }, + { + 'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel', + 'only_matching': True, + }, + { + 'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage', + 'only_matching': True, + }, + ] + + _TOKEN = 'prosieben' + _SALT = '01!8d8F_)r9]4s[qeuXfP%' + 
_CLIENT_NAME = 'kolibri-2.0.19-splec4' + + _ACCESS_ID = 'x_prosiebenmaxx-de' + _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag' + _IV = 'Aeluchoc6aevechuipiexeeboowedaok' + + _CLIPID_REGEXES = [ + r'"clip_id"\s*:\s+"(\d+)"', + r'clipid: "(\d+)"', + r'clip[iI]d=(\d+)', + r'clip[iI][dD]\s*=\s*["\'](\d+)', + r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)", + r'proMamsId"\s*:\s*"(\d+)', + r'proMamsId"\s*:\s*"(\d+)', + ] + _TITLE_REGEXES = [ + r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', + r'<header class="clearfix">\s*<h3>(.+?)</h3>', + r'<!-- start video -->\s*<h1>(.+?)</h1>', + r'<h1 class="att-name">\s*(.+?)</h1>', + r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>', + r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>', + r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', + r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>', + ] + _DESCRIPTION_REGEXES = [ + r'<p itemprop="description">\s*(.+?)</p>', + r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', + r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', + r'<p class="att-description">\s*(.+?)\s*</p>', + r'<p class="video-description" itemprop="description">\s*(.+?)</p>', + r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>', + ] + _UPLOAD_DATE_REGEXES = [ + r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"', + r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', + r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>', + r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>', + ] + _PAGE_TYPE_REGEXES = [ + r'<meta name="page_type" content="([^"]+)">', + r"'itemType'\s*:\s*'([^']*)'", + ] + _PLAYLIST_ID_REGEXES = [ + r'content[iI]d=(\d+)', + r"'itemId'\s*:\s*'([^']*)'", + ] + _PLAYLIST_CLIP_REGEXES = [ + r'(?s)data-qvt=.+?<a href="([^"]+)"', + ] + + def _extract_clip(self, url, webpage): + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, 
webpage, 'clip id') + title = self._html_search_regex( + self._TITLE_REGEXES, webpage, 'title', + default=None) or self._og_search_title(webpage) + info = self._extract_video_info(url, clip_id) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate( + self._html_search_meta('og:published_time', webpage, + 'upload date', default=None) + or self._html_search_regex(self._UPLOAD_DATE_REGEXES, + webpage, 'upload date', default=None)) + + json_ld = self._search_json_ld(webpage, clip_id, default={}) + + return merge_dicts(info, { + 'id': clip_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + }, json_ld) + + def _extract_playlist(self, url, webpage): + playlist_id = self._html_search_regex( + self._PLAYLIST_ID_REGEXES, webpage, 'playlist id') + playlist = self._parse_json( + self._search_regex( + r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script', + webpage, 'playlist'), + playlist_id) + entries = [] + for item in playlist: + clip_id = item.get('id') or item.get('upc') + if not clip_id: + continue + info = self._extract_video_info(url, clip_id) + info.update({ + 'id': clip_id, + 'title': item.get('title') or item.get('teaser', {}).get('headline'), + 'description': item.get('teaser', {}).get('description'), + 'thumbnail': item.get('poster'), + 'duration': float_or_none(item.get('duration')), + 'series': item.get('tvShowTitle'), + 'uploader': item.get('broadcastPublisher'), + }) + entries.append(info) + return self.playlist_result(entries, playlist_id) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_type = self._search_regex( + self._PAGE_TYPE_REGEXES, webpage, + 'page type', default='clip').lower() + if page_type == 'clip': + 
class PuhuTVIE(InfoExtractor):
    """Extractor for single films and episodes on puhutv.com (``…-izle`` pages)."""

    _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle'
    IE_NAME = 'puhutv'
    _TESTS = [{
        # film
        'url': 'https://puhutv.com/sut-kardesler-izle',
        'md5': 'a347470371d56e1585d1b2c8dab01c96',
        'info_dict': {
            'id': '5085',
            'display_id': 'sut-kardesler',
            'ext': 'mp4',
            'title': 'Süt Kardeşler',
            'description': 'md5:ca09da25b7e57cbb5a9280d6e48d17aa',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 4832.44,
            'creator': 'Arzu Film',
            'timestamp': 1561062602,
            'upload_date': '20190620',
            'release_year': 1976,
            'view_count': int,
            'tags': list,
        },
    }, {
        # episode, geo restricted, bypassable with --geo-verification-proxy
        'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle',
        'only_matching': True,
    }, {
        # 4k, with subtitles
        'url': 'https://puhutv.com/dip-1-bolum-izle',
        'only_matching': True,
    }]
    # Maps the language names used by the API to language codes; names not
    # listed here are used as the subtitle key unchanged.
    _SUBTITLE_LANGS = {
        'English': 'en',
        'Deutsch': 'de',
        'عربى': 'ar'
    }

    def _real_extract(self, url):
        """Return the info dict for the film or episode page at *url*.

        Calls ``raise_geo_restricted`` when the video list request is
        answered with HTTP 403; any other download error propagates.
        """
        display_id = self._match_id(url)

        api_url = urljoin(url, '/api/slug/%s-izle' % display_id)
        info = self._download_json(api_url, display_id)['data']

        video_id = compat_str(info['id'])
        # The 'title' object describes the show; 'name' is the item's own
        # name and falls back to the show name.
        show = info.get('title') or {}
        title = info.get('name') or show['name']
        display_name = info.get('display_name')
        if display_name:
            title = '%s %s' % (title, display_name)

        try:
            videos = self._download_json(
                'https://puhutv.com/api/assets/%s/videos' % video_id,
                display_id, 'Downloading video JSON',
                headers=self.geo_verification_headers())
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                self.raise_geo_restricted()
            raise

        seen_urls = set()
        formats = []
        for video in videos['data']['videos']:
            media_url = url_or_none(video.get('url'))
            # Skip entries without a usable URL and repeated URLs.
            if not media_url or media_url in seen_urls:
                continue
            seen_urls.add(media_url)

            playlist = video.get('is_playlist')
            is_hls_playlist = video.get('stream_type') == 'hls' and playlist is True
            if is_hls_playlist or 'playlist.m3u8' in media_url:
                # Master playlist: let the m3u8 parser enumerate variants.
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue

            video_format = video.get('video_format')
            looks_like_hls = (
                video_format == 'hls'
                or '/hls/' in media_url
                or '/chunklist.m3u8' in media_url)
            if looks_like_hls and playlist is False:
                format_id, protocol = 'hls', 'm3u8_native'
            elif video_format == 'mp4':
                format_id, protocol = 'http', None
            else:
                # Unknown container: ignore the entry.
                continue

            quality = int_or_none(video.get('quality'))
            if quality:
                format_id += '-%sp' % quality
            fmt = {
                'url': media_url,
                'ext': 'mp4',
                'height': quality,
            }
            if protocol:
                fmt['protocol'] = protocol
            fmt['format_id'] = format_id
            formats.append(fmt)
        self._sort_formats(formats)

        creator = try_get(
            show, lambda x: x['producer']['name'], compat_str)

        content = info.get('content') or {}

        wide_images = try_get(
            content, lambda x: x['images']['wide'], dict) or {}
        thumbnails = []
        for image_id, image_url in wide_images.items():
            if not isinstance(image_url, compat_str):
                continue
            # Values without a scheme are given an explicit https:// prefix.
            if not image_url.startswith(('http', '//')):
                image_url = 'https://%s' % image_url
            thumbnail = parse_resolution(image_id)
            thumbnail['id'] = image_id
            thumbnail['url'] = image_url
            thumbnails.append(thumbnail)

        tags = [
            genre.get('name')
            for genre in show.get('genres') or []
            if isinstance(genre, dict)
            and genre.get('name')
            and isinstance(genre.get('name'), compat_str)]

        subtitles = {}
        for subtitle in content.get('subtitles') or []:
            if not isinstance(subtitle, dict):
                continue
            lang = subtitle.get('language')
            sub_url = url_or_none(subtitle.get('url') or subtitle.get('file'))
            if lang and isinstance(lang, compat_str) and sub_url:
                lang_key = self._SUBTITLE_LANGS.get(lang, lang)
                subtitles[lang_key] = [{'url': sub_url}]

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': info.get('description') or show.get('description'),
            'season_id': str_or_none(info.get('season_id')),
            'season_number': int_or_none(info.get('season_number')),
            'episode_number': int_or_none(info.get('episode_number')),
            'release_year': int_or_none(show.get('released_at')),
            'timestamp': unified_timestamp(info.get('created_at')),
            'creator': creator,
            'view_count': int_or_none(content.get('watch_count')),
            'duration': float_or_none(content.get('duration_in_ms'), 1000),
            'tags': tags,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
            'formats': formats,
        }
class Puls4IE(ProSiebenSat1BaseIE):
    """Extractor for puls4.com, built on the shared ProSiebenSat.1 format lookup."""

    _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P<id>[^?#&]+)'
    _TESTS = [{
        'url': 'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118',
        'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03',
        'info_dict': {
            'id': '118118',
            'ext': 'flv',
            'title': 'Tobias Homberger von myclubs im #2min2miotalk',
            'description': 'md5:f9def7c5e8745d6026d8885487d91955',
            'upload_date': '20160830',
            'uploader': 'PULS_4',
        },
    }, {
        'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident.-Norbert-Hofer',
        'only_matching': True,
    }, {
        'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident-Analyse-des-Interviews-mit-Norbert-Hofer-416598',
        'only_matching': True,
    }]
    # Credentials consumed by ProSiebenSat1BaseIE._extract_video_info.
    _TOKEN = 'puls4'
    _SALT = '01!kaNgaiNgah1Ie4AeSha'
    _CLIENT_NAME = ''

    def _real_extract(self, url):
        """Return the info dict for the puls4.com page at *url*.

        Two JSON documents are fetched: the page description, which points to
        the content document, and the content document itself, whose
        ``mediaCurrent`` entry carries the clip id and metadata.
        """
        path = self._match_id(url)

        page = self._download_json(
            'http://www.puls4.com/api/json-fe/page/' + path, path)
        content_path = page['content'][0]['url']

        content = self._download_json(
            'http://www.puls4.com' + content_path, content_path)
        media = content['mediaCurrent']
        player_content = media['playerContent']

        # Formats and duration come from the shared backend lookup; the
        # site-specific metadata below overrides/extends that result.
        info = self._extract_video_info(url, player_content['id'])
        metadata = {
            'id': compat_str(media['objectId']),
            'title': player_content['title'],
            'description': media.get('description'),
            'thumbnail': media.get('previewLink'),
            'upload_date': unified_strdate(media.get('date')),
            'duration': parse_duration(player_content.get('duration')),
            'episode': player_content.get('episodePartName'),
            'show': media.get('channel'),
            'season_id': player_content.get('seasonId'),
            'uploader': player_content.get('sourceCompany'),
        }
        info.update(metadata)
        return info
class QQMusicIE(InfoExtractor):
    """Extractor for single songs on y.qq.com (QQ Music)."""

    IE_NAME = 'qqmusic'
    IE_DESC = 'QQ音乐'
    _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html'
    _TESTS = [{
        'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html',
        'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8',
        'info_dict': {
            'id': '004295Et37taLD',
            'ext': 'mp3',
            'title': '可惜没如果',
            'release_date': '20141227',
            'creator': '林俊杰',
            'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'note': 'There is no mp3-320 version of this song.',
        'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html',
        'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
        'info_dict': {
            'id': '004MsGEo3DdNxV',
            'ext': 'mp3',
            'title': '如果',
            'release_date': '20050626',
            'creator': '李季美',
            'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'note': 'lyrics not in .lrc format',
        'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',
        'info_dict': {
            'id': '001JyApY11tIp6',
            'ext': 'mp3',
            'title': 'Shadows Over Transylvania',
            'release_date': '19970225',
            'creator': 'Dark Funeral',
            'description': 'md5:c9b20210587cbcd6836a1c597bab4525',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Candidate renditions: 'prefix' is prepended to the song mid to form the
    # stream file name; 'preference' and 'abr' are copied into the format dict.
    _FORMATS = {
        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
    }

    # Reference: m_r_GetRUin() in top_player.js
    # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
    @staticmethod
    def m_r_get_ruin():
        """Return a pseudo-random integer used as the ``guid`` request parameter.

        A random value below 2**31 is multiplied by the millisecond part of
        the current time and reduced modulo 1E10.
        """
        curMs = int(time.time() * 1000) % 1000
        return int(round(random.random() * 2147483647) * curMs % 1E10)

    def _real_extract(self, url):
        """Return the info dict for the song page at *url*.

        Song name is mandatory; release date, singer, lyrics and thumbnail
        are optional and left as None when the detail page lacks them.
        Lyrics lines in LRC form are additionally exposed as an ``origin``
        subtitle track.
        """
        mid = self._match_id(url)

        detail_info_page = self._download_webpage(
            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
            mid, note='Download song detail info',
            errnote='Unable to get song detail info', encoding='gbk')

        song_name = self._html_search_regex(
            r"songname:\s*'([^']+)'", detail_info_page, 'song name')

        publish_time = self._html_search_regex(
            r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page,
            'publish time', default=None)
        if publish_time:
            # YYYY-MM-DD -> YYYYMMDD
            publish_time = publish_time.replace('-', '')

        singer = self._html_search_regex(
            r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None)

        lrc_content = self._html_search_regex(
            r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
            detail_info_page, 'LRC lyrics', default=None)
        if lrc_content:
            # The page carries literal backslash-n sequences, not newlines.
            lrc_content = lrc_content.replace('\\n', '\n')

        thumbnail_url = None
        albummid = self._search_regex(
            [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
            detail_info_page, 'album mid', default=None)
        if albummid:
            # The image path is sharded by the last two characters of the
            # album mid.
            thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \
                % (albummid[-2:-1], albummid[-1], albummid)

        guid = self.m_r_get_ruin()

        vkey = self._download_json(
            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
            mid, note='Retrieve vkey', errnote='Unable to get vkey',
            transform_source=strip_jsonp)['key']

        formats = [{
            'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
                   % (details['prefix'], mid, details['ext'], vkey, guid),
            'format': format_id,
            'format_id': format_id,
            'preference': details['preference'],
            'abr': details.get('abr'),
        } for format_id, details in self._FORMATS.items()]
        # Not every rendition exists for every song; drop unreachable ones.
        self._check_formats(formats, mid)
        self._sort_formats(formats)

        # The lyrics lookup above uses default=None, so a song without a
        # lyrics block yields None here; re.findall() raises TypeError on
        # None, which would abort the extraction of an otherwise valid song.
        actual_lrc_lyrics = ''
        if lrc_content:
            # Keep only timestamped lines and bracketed tag lines.
            actual_lrc_lyrics = ''.join(
                line + '\n' for line in re.findall(
                    r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content))

        info_dict = {
            'id': mid,
            'formats': formats,
            'title': song_name,
            'release_date': publish_time,
            'creator': singer,
            'description': lrc_content,
            'thumbnail': thumbnail_url
        }
        if actual_lrc_lyrics:
            info_dict['subtitles'] = {
                'origin': [{
                    'ext': 'lrc',
                    'data': actual_lrc_lyrics,
                }]
            }
        return info_dict
r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) + singer_desc = None + + if mid: + singer_desc_page = self._download_xml( + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, + 'Donwload singer description XML', + query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, + headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) + + singer_desc = singer_desc_page.find('./data/info/desc').text + + return self.playlist_result(entries, mid, singer_name, singer_desc) + + +class QQMusicAlbumIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:album' + IE_DESC = 'QQ音乐 - 专辑' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html' + + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', + 'info_dict': { + 'id': '000gXCTb2AhRR1', + 'title': '我们都是这样长大的', + 'description': 'md5:179c5dce203a5931970d306aa9607ea6', + }, + 'playlist_count': 4, + }, { + 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', + 'info_dict': { + 'id': '002Y5a3b3AlCu3', + 'title': '그리고...', + 'description': 'md5:a48823755615508a95080e81b51ba729', + }, + 'playlist_count': 8, + }] + + def _real_extract(self, url): + mid = self._match_id(url) + + album = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, + mid, 'Download album page')['data'] + + entries = [ + self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] + ) for song in album['list'] + ] + album_name = album.get('name') + album_detail = album.get('desc') + if album_detail is not None: + album_detail = album_detail.strip() + + return self.playlist_result(entries, mid, album_name, album_detail) + + +class QQMusicToplistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:toplist' + IE_DESC = 'QQ音乐 - 排行榜' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'https://y.qq.com/n/yqq/toplist/123.html', + 'info_dict': 
{ + 'id': '123', + 'title': '美国iTunes榜', + 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/3.html', + 'info_dict': { + 'id': '3', + 'title': '巅峰榜·欧美', + 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', + }, + 'playlist_count': 100, + }, { + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', + 'info_dict': { + 'id': '106', + 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', + }, + 'playlist_count': 50, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + toplist_json = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) + + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] + + topinfo = toplist_json.get('topinfo', {}) + list_name = topinfo.get('ListName') + list_description = topinfo.get('info') + return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:playlist' + IE_DESC = 'QQ音乐 - 歌单' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' + + _TESTS = [{ + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', + 'info_dict': { + 'id': '3462654915', + 'title': '韩国5月新歌精选下旬', + 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', + }, + 'playlist_count': 40, + 'skip': 'playlist gone', + }, { + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', + 'info_dict': { + 'id': '1374105607', + 'title': '易入人心的华语民谣', + 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_json = self._download_json( + 
'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, + transform_source=strip_jsonp) + if not len(list_json.get('cdlist', [])): + if list_json.get('code'): + raise ExtractorError( + 'QQ Music said: error %d in fetching playlist info' % list_json['code'], + expected=True) + raise ExtractorError('Unable to get playlist info') + + cdlist = list_json['cdlist'][0] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] + + list_name = cdlist.get('dissname') + list_description = clean_html(unescapeHTML(cdlist.get('desc'))) + return self.playlist_result(entries, list_id, list_name, list_description) diff --git a/youtube_dlc/extractor/r7.py b/youtube_dlc/extractor/r7.py new file mode 100644 index 0000000..e2202d6 --- /dev/null +++ b/youtube_dlc/extractor/r7.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class R7IE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| + noticias\.r7\.com(?:/[^/]+)+/[^/]+-| + player\.r7\.com/video/i/ + ) + (?P<id>[\da-f]{24}) + ''' + _TESTS = [{ + 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', + 'md5': '403c4e393617e8e8ddc748978ee8efde', + 'info_dict': { + 'id': '54e7050b0cf2ff57e0279389', + 'ext': 'mp4', + 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'description': 'md5:01812008664be76a6479aa58ec865b72', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 98, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', + 'only_matching': 
True, + }, { + 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', + 'only_matching': True, + }, { + 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://player-api.r7.com/video/i/%s' % video_id, video_id) + + title = video['title'] + + formats = [] + media_url_hls = video.get('media_url_hls') + if media_url_hls: + formats.extend(self._extract_m3u8_formats( + media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + media_url = video.get('media_url') + if media_url: + f = { + 'url': media_url, + 'format_id': 'http', + } + # m3u8 format always matches the http format, let's copy metadata from + # one to another + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none', formats)) + if len(m3u8_formats) == 1: + f_copy = m3u8_formats[0].copy() + f_copy.update(f) + f_copy['protocol'] = 'http' + f = f_copy + formats.append(f) + self._sort_formats(formats) + + description = video.get('description') + thumbnail = video.get('thumb') + duration = int_or_none(video.get('media_duration')) + like_count = int_or_none(video.get('likes')) + view_count = int_or_none(video.get('views')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, 
+ 'view_count': view_count, + 'formats': formats, + } + + +class R7ArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)' + _TEST = { + 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})', + webpage, 'video id') + + return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key()) diff --git a/youtube_dlc/extractor/radiobremen.py b/youtube_dlc/extractor/radiobremen.py new file mode 100644 index 0000000..2c35f98 --- /dev/null +++ b/youtube_dlc/extractor/radiobremen.py @@ -0,0 +1,63 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class RadioBremenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)' + IE_NAME = 'radiobremen' + + _TEST = { + 'url': 'http://www.radiobremen.de/mediathek/?id=141876', + 'info_dict': { + 'id': '141876', + 'ext': 'mp4', + 'duration': 178, + 'width': 512, + 'title': 'Druck auf Patrick Öztürk', + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt.
Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + meta_url = 'http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s' % video_id + meta_doc = self._download_webpage( + meta_url, video_id, 'Downloading metadata') + title = self._html_search_regex( + r'<h1.*>(?P<title>.+)</h1>', meta_doc, 'title') + description = self._html_search_regex( + r'<p>(?P<description>.*)</p>', meta_doc, 'description', fatal=False) + duration = parse_duration(self._html_search_regex( + r'Länge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>', + meta_doc, 'duration', fatal=False)) + + page_doc = self._download_webpage( + url, video_id, 'Downloading video information') + mobj = re.search( + r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)", + page_doc) + video_url = ( + "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" % + (video_id, video_id, mobj.group("secret"), mobj.group('width'))) + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'width': int(mobj.group('width')), + }] + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'thumbnail': mobj.group('thumbnail'), + } diff --git a/youtube_dlc/extractor/radiocanada.py b/youtube_dlc/extractor/radiocanada.py new file mode 100644 index 0000000..a28b1a2 --- /dev/null +++ b/youtube_dlc/extractor/radiocanada.py @@ -0,0 +1,171 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unified_strdate, +) + + +class RadioCanadaIE(InfoExtractor): + IE_NAME = 'radiocanada' + _VALID_URL = 
r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, + } + ] + _GEO_COUNTRIES = ['CA'] + _access_token = None + _claims = None + + def _call_api(self, path, video_id=None, app_code=None, query=None): + if not query: + query = {} + query.update({ + 'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb', + 'output': 'json', + }) + if video_id: + query.update({ + 'appCode': app_code, + 'idMedia': video_id, + }) + if self._access_token: + query['access_token'] = self._access_token + try: + return self._download_json( + 'https://services.radio-canada.ca/media/' + path, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): + data = self._parse_json(e.cause.read().decode(), None) + error = data.get('error_description') or data['errorMessage']['text'] + raise ExtractorError(error, expected=True) + raise + + def _extract_info(self, app_code, video_id): + metas = self._call_api('meta/v1/index.ashx', video_id, 
app_code)['Metas'] + + def get_meta(name): + for meta in metas: + if meta.get('name') == name: + text = meta.get('text') + if text: + return text + + # protectionType does not necessarily mean the video is DRM protected (see + # https://github.com/ytdl-org/youtube-dl/pull/18609). + if get_meta('protectionType'): + self.report_warning('This video is probably DRM protected.') + + query = { + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + } + if self._claims: + query['claims'] = self._claims + v_data = self._call_api('validation/v2/', video_id, app_code, query) + v_url = v_data.get('url') + if not v_url: + error = v_data['message'] + if error == "Le contenu sélectionné n'est pas disponible dans votre pays": + self.raise_geo_restricted(error, self._GEO_COUNTRIES) + if error == 'Le contenu sélectionné est disponible seulement en premium': + self.raise_login_required(error) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5') + if closed_caption_url: + subtitles['fr'] = [{ + 'url': closed_caption_url, + 'ext': determine_ext(closed_caption_url, 'vtt'), + }] + + return { + 'id': video_id, + 'title': get_meta('Title') or get_meta('AV-nomEmission'), + 'description': get_meta('Description') or get_meta('ShortDescription'), + 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), + 'duration': int_or_none(get_meta('length')), + 'series': get_meta('Emission'), + 'season_number': int_or_none(get_meta('SrcSaison')), + 'episode_number': int_or_none(get_meta('SrcEpisode')), + 'upload_date': unified_strdate(get_meta('Date')), + 'subtitles': subtitles, + 'formats': formats, + } + + def _real_extract(self, url): + return self._extract_info(*re.match(self._VALID_URL, url).groups()) + + +class
RadioCanadaAudioVideoIE(InfoExtractor): + IE_NAME = 'radiocanada:audiovideo' + _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', + 'info_dict': { + 'id': '7527184', + 'ext': 'mp4', + 'title': 'Barack Obama au Vietnam', + 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam', + 'upload_date': '20160523', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) diff --git a/youtube_dlc/extractor/radiode.py b/youtube_dlc/extractor/radiode.py new file mode 100644 index 0000000..2c06c8b --- /dev/null +++ b/youtube_dlc/extractor/radiode.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RadioDeIE(InfoExtractor): + IE_NAME = 'radio.de' + _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)' + _TEST = { + 'url': 'http://ndr2.radio.de/', + 'info_dict': { + 'id': 'ndr2', + 'ext': 'mp3', + 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:591c49c702db1a33751625ebfb67f273', + 'thumbnail': r're:^https?://.*\.png', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + radio_id = self._match_id(url) + webpage = self._download_webpage(url, radio_id) + jscode = self._search_regex( + r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n", + webpage, 'broadcast') + + broadcast = self._parse_json(jscode, radio_id) + title = self._live_title(broadcast['name']) + description = broadcast.get('description') or broadcast.get('shortDescription') + thumbnail = 
broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100') + + formats = [{ + 'url': stream['streamUrl'], + 'ext': stream['streamContentFormat'].lower(), + 'acodec': stream['streamContentFormat'], + 'abr': stream['bitRate'], + 'asr': stream['sampleRate'] + } for stream in broadcast['streamUrls']] + self._sort_formats(formats) + + return { + 'id': radio_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/radiofrance.py b/youtube_dlc/extractor/radiofrance.py new file mode 100644 index 0000000..a8afc00 --- /dev/null +++ b/youtube_dlc/extractor/radiofrance.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RadioFranceIE(InfoExtractor): + _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' + IE_NAME = 'radiofrance' + + _TEST = { + 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', + 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', + 'info_dict': { + 'id': 'one-one', + 'ext': 'ogg', + 'title': 'One to one', + 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + 'uploader': 'Thomas Hercouët', + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') + description = self._html_search_regex( + r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', + webpage, 'description', fatal=False) + uploader = self._html_search_regex( + r'<div class="credit">  © (.*?)</div>', + webpage, 'uploader', fatal=False) + + formats_str = 
self._html_search_regex( + r'class="jp-jplayer[^"]*" data-source="([^"]+)">', + webpage, 'audio URLs') + formats = [ + { + 'format_id': fm[0], + 'url': fm[1], + 'vcodec': 'none', + 'preference': i, + } + for i, fm in + enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + } diff --git a/youtube_dlc/extractor/radiojavan.py b/youtube_dlc/extractor/radiojavan.py new file mode 100644 index 0000000..3f74f0c --- /dev/null +++ b/youtube_dlc/extractor/radiojavan.py @@ -0,0 +1,83 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_resolution, + str_to_int, + unified_strdate, + urlencode_postdata, + urljoin, +) + + +class RadioJavanIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?' + _TEST = { + 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', + 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', + 'info_dict': { + 'id': 'chaartaar-ashoobam', + 'ext': 'mp4', + 'title': 'Chaartaar - Ashoobam', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'upload_date': '20150215', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + download_host = self._download_json( + 'https://www.radiojavan.com/videos/video_host', video_id, + data=urlencode_postdata({'id': video_id}), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }).get('host', 'https://host1.rjmusicmedia.com') + + webpage = self._download_webpage(url, video_id) + + formats = [] + for format_id, _, video_path in re.findall( + r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + f = parse_resolution(format_id) + f.update({ + 'url': urljoin(download_host, video_path), + 
'format_id': format_id, + }) + formats.append(f) + self._sort_formats(formats) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + upload_date = unified_strdate(self._search_regex( + r'class="date_added">Date added: ([^<]+)<', + webpage, 'upload date', fatal=False)) + + view_count = str_to_int(self._search_regex( + r'class="views">Plays: ([\d,]+)', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) likes', + webpage, 'like count', fatal=False)) + dislike_count = str_to_int(self._search_regex( + r'class="rating">([\d,]+) dislikes', + webpage, 'dislike count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py new file mode 100644 index 0000000..51a310f --- /dev/null +++ b/youtube_dlc/extractor/rai.py @@ -0,0 +1,473 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_str, +) +from ..utils import ( + ExtractorError, + determine_ext, + find_xpath_attr, + fix_xml_ampersands, + GeoRestrictedError, + int_or_none, + parse_duration, + strip_or_none, + try_get, + unified_strdate, + unified_timestamp, + update_url_query, + urljoin, + xpath_text, +) + + +class RaiBaseIE(InfoExtractor): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + _BASE_URL = 'https://www.raiplay.it' + + def _extract_relinker_info(self, relinker_url, video_id): + if not re.match(r'https?://', relinker_url): + return {'formats': [{'url': relinker_url}]} + + formats = [] + geoprotection = None + is_live = None + duration = None + + for platform in ('mon', 'flash', 
'native'): + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, + headers=self.geo_verification_headers()) + + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. + # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) + if media_url == 'http://download.rai.it/video_no_available.mp4': + continue + + ext = determine_ext(media_url) + if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): + continue + + if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m' or platform == 'flash': + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) or 0 # int_or_none may yield None; None > 0 raises TypeError on Python 3 + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + + if not formats and geoprotection is True: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + return dict((k, v) for k, v in { + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }.items() if v is not None) + +
@staticmethod + def _extract_subtitles(url, subtitle_url): + subtitles = {} + if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = urljoin(url, subtitle_url) + STL_EXT = '.stl' + SRT_EXT = '.srt' + subtitles['it'] = [{ + 'ext': 'stl', + 'url': subtitle_url, + }] + if subtitle_url.endswith(STL_EXT): + srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT + subtitles['it'].append({ + 'ext': 'srt', + 'url': srt_url, + }) + return subtitles + + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report del 07/04/2014', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai Gulp', + 'duration': 6160, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext') + + media = self._download_json( + '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') + + title = media['name'] + video = media['video'] + + relinker_info = self._extract_relinker_info(video['content_url'], video_id) + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + if 'images' in media: + for _, value in media.get('images').items(): + if value: + thumbnails.append({ + 'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400')) + }) + + timestamp = unified_timestamp(try_get( + media, lambda 
x: x['availabilities'][0]['start'], compat_str)) + + subtitles = self._extract_subtitles(url, video.get('subtitles')) + + info = { + 'id': video_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), + 'duration': parse_duration(video.get('duration')), + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'series': try_get( + media, lambda x: x['isPartOf']['name'], compat_str), + 'season_number': int_or_none(try_get( + media, lambda x: x['isPartOf']['numeroStagioni'])), + 'season': media.get('stagione') or None, + 'subtitles': subtitles, + } + + info.update(relinker_info) + return info + + +class RaiPlayLiveIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.raiplay.it/dirette/rainews24', + 'info_dict': { + 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', + 'display_id': 'rainews24', + 'ext': 'mp4', + 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', + 'uploader': 'Rai News 24', + 'creator': 'Rai News 24', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + media = self._download_json( + '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id), + display_id, 'Downloading channel JSON') + + title = media['name'] + video = media['video'] + video_id = media['id'].replace('ContentItem-', '') + + relinker_info = self._extract_relinker_info(video['content_url'], video_id) + self._sort_formats(relinker_info['formats']) + + info = { + 'id': video_id, + 'display_id': display_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'alt_title': media.get('subtitle'), + 
'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), + 'duration': parse_duration(video.get('duration')), + } + + info.update(relinker_info) + return info + + +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + media = self._download_json( + '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), + playlist_id, 'Downloading program JSON') + + title = media['name'] + description = media['program_info']['description'] + + content_sets = [s['id'] for b in media['blocks'] for s in b['sets']] + + entries = [] + for cs in content_sets: + medias = self._download_json( + '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs), + cs, 'Downloading content set JSON') + for m in medias['items']: + video_url = urljoin(url, m['path_id']) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result(entries, playlist_id, title, description) + + +class RaiIE(RaiBaseIE): + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _TESTS = [{ + # var uniquename = "ContentItem-..." + # data-id="ContentItem-..." 
+ 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1758, + 'upload_date': '20140612', + } + }, { + # with ContentItem in many metas + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 833, + 'upload_date': '20161103', + } + }, { + # with ContentItem in og:url + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, + 'upload_date': '20161103', + } + }, { + # initEdizione('ContentItem-...' 
def _extract_from_content_id(self, content_id, url):
    """Build the info dict of a ContentItem from its legacy rai.tv JSON.

    Audio items yield a single format taken straight from the JSON; video
    items are resolved through the relinker.  Any other item type raises
    ExtractorError.
    """
    item = self._download_json(
        'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
        content_id, 'Downloading video JSON')

    name = item['name'].strip()

    kind = item['type']
    if 'Audio' in kind:
        media_info = {
            'formats': [{
                'format_id': item.get('formatoAudio'),
                'url': item['audioUrl'],
                'ext': item.get('formatoAudio'),
            }]
        }
    elif 'Video' in kind:
        media_info = self._extract_relinker_info(item['mediaUri'], content_id)
    else:
        raise ExtractorError('not a media file')

    self._sort_formats(media_info['formats'])

    # Keep only the image variants that are actually present
    thumbnails = [
        {'url': compat_urlparse.urljoin(url, item[image_key])}
        for image_key in ('image', 'image_medium', 'image_300')
        if item.get(image_key)]

    result = {
        'id': content_id,
        'title': name,
        'description': strip_or_none(item.get('desc')),
        'thumbnails': thumbnails,
        'uploader': item.get('author'),
        'upload_date': unified_strdate(item.get('date')),
        'duration': parse_duration(item.get('length')),
        'subtitles': self._extract_subtitles(url, item.get('subtitlesUrl')),
    }
    # Format related keys win over the metadata collected above
    result.update(media_info)
    return result
@staticmethod
def _extract_video_id(data, lesson_id):
    """Return the video identifier of lesson *lesson_id*, or None.

    data is the parsed ``data-collection`` JSON of the lesson page and
    lesson_id is the lesson ordinal, either as an int or as the numeric
    string captured from the URL.  The first content entry whose
    'ordinal' equals the lesson number and that has a truthy 'identifier'
    wins; the identifier is returned as a string.
    """
    # The caller passes the regex group (a string) while 'ordinal' is
    # normalised to an int below, so normalise both sides before comparing.
    lesson_ordinal = int_or_none(lesson_id)
    if not data or lesson_ordinal is None:
        return
    groups = try_get(data, lambda x: x['groups'], list) or []
    for group in groups:
        if not isinstance(group, dict):
            continue
        # Read the contents of the group being iterated; the top-level
        # lookup is kept as a fallback so any payload that worked before
        # still works.  NOTE(review): confirm the nesting against a live
        # data-collection payload.
        contents = (
            try_get(group, lambda x: x['contents'], list)
            or try_get(data, lambda x: x['contents'], list)
            or [])
        for content in contents:
            if not isinstance(content, dict):
                continue
            if int_or_none(content.get('ordinal')) != lesson_ordinal:
                continue
            video_id = content.get('identifier')
            if video_id:
                return compat_str(video_id)
class RayWenderlichCourseIE(InfoExtractor):
    """Extract a raywenderlich.com course page as a playlist of lessons."""
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            videos\.raywenderlich\.com/courses|
                            (?:www\.)?raywenderlich\.com
                        )/
                        (?P<id>[^/]+)
                    '''

    _TEST = {
        'url': 'https://www.raywenderlich.com/3530-testing-in-ios',
        'info_dict': {
            'title': 'Testing in iOS',
            'id': '3530-testing-in-ios',
        },
        'params': {
            'noplaylist': False,
        },
        'playlist_count': 29,
    }

    @classmethod
    def suitable(cls, url):
        """Decline URLs that the single-lesson extractor already handles."""
        return False if RayWenderlichIE.suitable(url) else super(
            RayWenderlichCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Collect the unique lesson links of the course, in page order."""
        course_id = self._match_id(url)

        webpage = self._download_webpage(url, course_id)

        # course_id comes from the URL and may contain regex metacharacters
        # (the id group accepts anything but '/'), so escape it before
        # embedding it in the pattern.
        lesson_re = (
            r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % re.escape(course_id))

        entries = []
        lesson_urls = set()
        for lesson_url in re.findall(lesson_re, webpage):
            # The same lesson may be linked more than once on the page
            if lesson_url in lesson_urls:
                continue
            lesson_urls.add(lesson_url)
            entries.append(self.url_result(
                urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key()))

        title = self._og_search_title(
            webpage, default=None) or self._html_search_meta(
            'twitter:title', webpage, 'title', default=None)

        return self.playlist_result(entries, course_id, title)
def _real_extract(self, url):
    """Extract one episode from the page's embedded __INITIAL_STATE__ JSON.

    The episode is looked up by show id and episode id; one audio-only
    format is offered per bitrate in (96, 128, 192, 256).
    """
    mobj = re.match(self._VALID_URL, url)
    show_id = mobj.group('show_id')
    episode_id = mobj.group('id')

    webpage = self._download_webpage(url, episode_id)

    episode = self._parse_json(
        self._search_regex(
            r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>',
            webpage, 'json data'),
        episode_id)['episodes'][show_id][episode_id]

    title = episode['title']

    show_title = episode.get('showTitle')
    if show_title:
        title = '%s - %s' % (show_title, title)

    # The requested bitrate is passed to the audio URL as the 'cbr' query
    # parameter.
    formats = [{
        'url': update_url_query(episode['audioURL'], query={'cbr': abr}),
        'format_id': compat_str(abr),
        'abr': abr,
        'vcodec': 'none',
    } for abr in (96, 128, 192, 256)]
    self._check_formats(formats, episode_id)

    description = clean_html(episode.get('longTeaser'))
    # 'imageURL' may be present but null; a .get() default only covers a
    # missing key, so guard with "or {}" to keep the thumbnail optional.
    thumbnail = self._proto_relative_url(
        (episode.get('imageURL') or {}).get('landscape'))
    duration = int_or_none(episode.get('duration'))
    timestamp = unified_timestamp(episode.get('publishedAt'))

    return {
        'id': episode_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'timestamp': timestamp,
        'formats': formats,
    }
class RDSIE(InfoExtractor):
    """Extract RDS.ca video pages and hand playback over to NineCNineMedia."""
    IE_DESC = 'RDS.ca'
    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'

    _TESTS = [{
        'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
        'info_dict': {
            'id': '604333',
            'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
            'ext': 'flv',
            'title': 'Fowler Jr. prend la direction de Jacksonville',
            'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
            'timestamp': 1430397346,
            'upload_date': '20150430',
            'duration': 154.354,
            'age_limit': 0,
        }
    }, {
        'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_transparent result pointing at the 9c9media item.

        Metadata is read from the page's ``itemToPush`` object first and
        from OpenGraph / meta / microdata markup as fallbacks.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json)
        video_id = compat_str(item['id'])
        # The OpenGraph lookup must not be fatal here, otherwise the
        # <meta name="title"> fallback after it can never be reached; the
        # last link of the chain stays fatal because a title is mandatory.
        title = item.get('title') or self._og_search_title(
            webpage, default=None) or self._html_search_meta(
            'title', webpage, 'title', fatal=True)
        description = self._og_search_description(webpage) or self._html_search_meta(
            'description', webpage, 'description')
        thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex(
            [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
             r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
            webpage, 'thumbnail', fatal=False)
        timestamp = parse_iso8601(self._search_regex(
            r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"',
            webpage, 'upload date', fatal=False))
        duration = parse_duration(self._search_regex(
            r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"',
            webpage, 'duration', fatal=False))
        age_limit = self._family_friendly_search(webpage)

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'display_id': display_id,
            'url': '9c9media:rds_web:%s' % video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'age_limit': age_limit,
            'ie_key': 'NineCNineMedia',
        }
def extract_info(self, video_id):
    """Return the info dict for the Red Bull TV product *video_id*.

    Three requests are made in order: a session request that yields the
    access token, a product request for the metadata, and the HLS
    manifest whose URL embeds the token.

    Raises ExtractorError when the session endpoint reports an error or
    when the product request fails; a 404 from the product endpoint is
    re-raised as an expected error carrying the API's own message.
    """
    session = self._download_json(
        'https://api.redbull.tv/v3/session', video_id,
        note='Downloading access token', query={
            'category': 'personal_computer',
            'os_family': 'http',
        })
    # The session endpoint signals failure in the JSON body
    if session.get('code') == 'error':
        raise ExtractorError('%s said: %s' % (
            self.IE_NAME, session['message']))
    token = session['token']

    try:
        video = self._download_json(
            'https://api.redbull.tv/v3/products/' + video_id,
            video_id, note='Downloading video information',
            headers={'Authorization': token}
        )
    except ExtractorError as e:
        # Only a 404 is translated: its response body is parsed as JSON
        # and the 'error' field is shown to the user. Anything else
        # propagates unchanged.
        if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
            error_message = self._parse_json(
                e.cause.read().decode(), video_id)['error']
            raise ExtractorError('%s said: %s' % (
                self.IE_NAME, error_message), expected=True)
        raise

    title = video['title'].strip()

    formats = self._extract_m3u8_formats(
        'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token),
        video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
    self._sort_formats(formats)

    subtitles = {}
    for resource in video.get('resources', []):
        # Caption resources are named 'closed_caption_<ext>...': the third
        # '_'-separated token is used as the subtitle extension, and every
        # caption is filed under 'en'.
        if resource.startswith('closed_caption_'):
            splitted_resource = resource.split('_')
            if splitted_resource[2]:
                subtitles.setdefault('en', []).append({
                    'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource),
                    'ext': splitted_resource[2],
                })

    # The subheading, when present, is appended to the title
    subheading = video.get('subheading')
    if subheading:
        title += ' - %s' % subheading

    return {
        'id': video_id,
        'title': title,
        'description': video.get('long_description') or video.get(
            'short_description'),
        # scale=1000 divides the reported value by 1000
        'duration': float_or_none(video.get('duration'), scale=1000),
        'formats': formats,
        'subtitles': subtitles,
    }
class RedBullTVRrnContentIE(InfoExtractor):
    """Map redbull.com /tv/ pages that carry an rrn id onto the embed URL."""
    _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P<region>[a-z]{2,3})-(?P<lang>[a-z]{2})/tv/(?:video|live|film)/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
    _TESTS = [{
        'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william',
        'only_matching': True,
    }, {
        'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras',
        'only_matching': True,
    }, {
        'url': 'https://www.redbull.com/int-en/tv/film/rrn:content:films:d1f4d00e-4c04-5d19-b510-a805ffa2ab83/follow-me',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Append the '<lang>-<REGION>' locale to the rrn id and delegate."""
        match = re.search(self._VALID_URL, url)
        region = match.group('region')
        lang = match.group('lang')
        # The embed extractor expects the rrn id suffixed with the locale
        localized_rrn = '%s:%s-%s' % (match.group('id'), lang, region.upper())
        embed_url = 'https://www.redbull.com/embed/' + localized_rrn
        return self.url_result(
            embed_url, RedBullEmbedIE.ie_key(), localized_rrn)
def _real_extract(self, url):
    """Resolve a slug based redbull.com page to its rrn id and delegate.

    The GraphQL endpoint is queried with a '>'-separated locale chain:
    the page's own region first, then the LAT and INT fallbacks when the
    region or language is listed in the class fallback tables.
    """
    match = re.search(self._VALID_URL, url)
    region, lang = match.group('region', 'lang')
    display_id = match.group('id')

    # Two URL path types are named differently in the API filter
    url_type = match.group('type')
    filter_type = {
        'episodes': 'episode-videos',
        'live': 'live-videos',
    }.get(url_type, url_type)

    region_chain = [region.upper()]
    if region != 'int':
        if region in self._LAT_FALLBACK_MAP:
            region_chain.append('LAT')
        if lang in self._INT_FALLBACK_LIST:
            region_chain.append('INT')
    locale = '>'.join('%s-%s' % (lang, code) for code in region_chain)

    response = self._download_json(
        'https://www.redbull.com/v3/api/graphql/v1/v3/query/' + locale,
        display_id, query={
            'filter[type]': filter_type,
            'filter[uriSlug]': display_id,
            'rb3Schema': 'v1:hero',
        })
    rrn_id = response['data']['id']

    return self.url_result(
        'https://www.redbull.com/embed/' + rrn_id,
        RedBullEmbedIE.ie_key(), rrn_id)
def _real_extract(self, url):
    """Resolve a reddit comments page to the media URL its post links to.

    The post's JSON is fetched by appending '/.json' to the canonical
    part of the URL; the linked URL is returned as a url_transparent
    result together with the post's metadata.
    """
    # Keep only the part of the URL captured by the 'url' group
    url = re.match(self._VALID_URL, url).group('url')
    video_id = self._match_id(url)

    listing = self._download_json(url + '/.json', video_id)
    post = listing[0]['data']['children'][0]['data']

    video_url = post['url']

    # Avoid recursing into the same reddit URL
    links_to_itself = (
        'reddit.com/' in video_url and '/%s/' % video_id in video_url)
    if links_to_itself:
        raise ExtractorError('No media found', expected=True)

    # Only an explicit boolean decides the age limit; anything else
    # leaves it unknown.
    over_18 = post.get('over_18')
    age_limit = None
    if over_18 is True:
        age_limit = 18
    elif over_18 is False:
        age_limit = 0

    return {
        '_type': 'url_transparent',
        'url': video_url,
        'title': post.get('title'),
        'thumbnail': url_or_none(post.get('thumbnail')),
        'timestamp': float_or_none(post.get('created_utc')),
        'uploader': post.get('author'),
        'like_count': int_or_none(post.get('ups')),
        'dislike_count': int_or_none(post.get('downs')),
        'comment_count': int_or_none(post.get('num_comments')),
        'age_limit': age_limit,
    }
+1,136 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + merge_dicts, + str_to_int, + unified_strdate, + url_or_none, +) + + +class RedTubeIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.redtube.com/66418', + 'md5': 'fc08071233725f26b8f014dba9590005', + 'info_dict': { + 'id': '66418', + 'ext': 'mp4', + 'title': 'Sucked on a toilet', + 'upload_date': '20110811', + 'duration': 596, + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', + 'only_matching': True, + }, { + 'url': 'http://it.redtube.com/66418', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.redtube.com/%s' % video_id, video_id) + + ERRORS = ( + (('video-deleted-info', '>This video has been removed'), 'has been removed'), + (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), + ) + + for patterns, message in ERRORS: + if any(p in webpage for p in patterns): + raise ExtractorError( + 'Video %s %s' % (video_id, message), expected=True) + + info = self._search_json_ld(webpage, video_id, default={}) + + if not info.get('title'): + info['title'] = self._html_search_regex( + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + formats = [] + sources = self._parse_json( + self._search_regex( + 
r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + medias = self._parse_json( + self._search_regex( + r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = url_or_none(media.get('videoUrl')) + if not format_url: + continue + if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<', + webpage, 'upload date', default=None)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, default=None) or self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) + view_count = str_to_int(self._search_regex( + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)', + r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'), + webpage, 'view count', default=None)) + + # No self-labeling, but they describe themselves as + # "Home of Videos Porno" + age_limit = 18 + + return merge_dicts(info, { + 'id': video_id, + 'ext': 'mp4', + 'thumbnail': 
class RegioTVIE(InfoExtractor):
    """Extract regio-tv.de videos through the telvi.de SOAP endpoint."""
    _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.regio-tv.de/video/395808.html',
        'info_dict': {
            'id': '395808',
            'ext': 'mp4',
            'title': 'Wir in Ludwigsburg',
            'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!',
        }
    }, {
        'url': 'http://www.regio-tv.de/video/395808',
        'only_matching': True,
    }]

    # SOAP envelope; {0} is the operation name, {1} the per-video key
    _SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>'

    # Namespace prefixes handed to xpath_with_ns
    _NS_MAP = {
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
        'soap': 'http://schemas.xmlsoap.org/soap/envelope/',
    }

    def _real_extract(self, url):
        """Read the key from the page, then ask telvi.de for the media URL."""
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        player_key = self._search_regex(
            r'key\s*:\s*(["\'])(?P<key>.+?)\1', page, 'key', group='key')
        title = self._og_search_title(page)

        soap_body = self._SOAP_TEMPLATE.format('GetHTML5VideoData', player_key)
        soap_request = sanitized_Request(
            'http://v.telvi.de/', soap_body.encode('utf-8'))
        video_xml = self._download_xml(
            soap_request, video_id, 'Downloading video XML')

        # The video URL is mandatory, the thumbnail is not
        media_url = xpath_text(
            video_xml, xpath_with_ns('.//video', self._NS_MAP),
            'video url', fatal=True)
        image_url = xpath_text(
            video_xml, xpath_with_ns('.//image', self._NS_MAP), 'thumbnail')

        description = self._og_search_description(page)
        if not description:
            description = self._html_search_meta('description', page)

        return {
            'id': video_id,
            'url': media_url,
            'title': title,
            'description': description,
            'thumbnail': image_url,
        }
Технологии XXI века"', + 'timestamp': 1472230800, + 'upload_date': '20160826', + } + }, { + 'url': 'http://ren.tv/player/118577', + 'only_matching': True, + }, { + 'url': 'rentv:118577', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) + config = self._parse_json(self._search_regex( + r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) + title = config['title'] + formats = [] + for video in config['src']: + src = url_or_none(video.get('src')) + if not src: + continue + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'duration': int_or_none(config.get('duration')), + 'timestamp': int_or_none(config.get('date')), + 'formats': formats, + } + + +class RENTVArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v', + 'md5': 'ebd63c4680b167693745ab91343df1d6', + 'info_dict': { + 'id': '136472', + 'ext': 'mp4', + 'title': 'Видео: микроавтобус, попавший в ДТП с грузовиками в Подмосковье, превратился в груду металла', + 'description': 'Жертвами столкновения двух фур и микроавтобуса, по последним данным, стали семь человек.', + } + }, { + # TODO: invalid m3u8 + 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', + 'info_dict': { + 'id': 'playlist', + 'ext': 'mp4', + 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. 
ВИДЕО | РЕН ТВ', + 'uploader': 'ren.tv', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + entries = [] + for config_profile in drupal_settings.get('ren_jwplayer', {}).values(): + media_id = config_profile.get('mediaid') + if not media_id: + continue + media_id = compat_str(media_id) + entries.append(self.url_result('rentv:' + media_id, 'RENTV', media_id)) + return self.playlist_result(entries, display_id) diff --git a/youtube_dlc/extractor/restudy.py b/youtube_dlc/extractor/restudy.py new file mode 100644 index 0000000..d47fb45 --- /dev/null +++ b/youtube_dlc/extractor/restudy.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RestudyIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.restudy.dk/video/play/id/1637', + 'info_dict': { + 'id': '1637', + 'ext': 'flv', + 'title': 'Leiden-frosteffekt', + 'description': 'Denne video er et eksperiment med flydende kvælstof.', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + description = self._og_search_description(webpage).strip() + + formats = self._extract_smil_formats( + 'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id, + video_id) + self._sort_formats(formats) + + return { + 'id': video_id, + 
'title': title, + 'description': description, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/reuters.py b/youtube_dlc/extractor/reuters.py new file mode 100644 index 0000000..9dc482d --- /dev/null +++ b/youtube_dlc/extractor/reuters.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + int_or_none, + unescapeHTML, +) + + +class ReutersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562', + 'md5': '8015113643a0b12838f160b0b81cc2ee', + 'info_dict': { + 'id': '368575562', + 'ext': 'mp4', + 'title': 'San Francisco police chief resigns', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id) + video_data = js_to_json(self._search_regex( + r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);', + webpage, 'video data')) + + def get_json_value(key, fatal=False): + return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal) + + title = unescapeHTML(get_json_value('title', fatal=True)) + mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups() + + mas_data = self._download_json( + 'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid), + video_id, transform_source=js_to_json) + formats = [] + for f in mas_data: + f_url = f.get('url') + if not f_url: + continue + method = f.get('method') + if method == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + container = f.get('container') + ext = '3gp' if method == 'mobile' else container + formats.append({ + 'format_id': ext, + 'url': f_url, + 'ext': ext, + 'container': 
container if method != 'mobile' else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': get_json_value('thumb'), + 'duration': int_or_none(get_json_value('seconds')), + 'formats': formats, + } diff --git a/youtube_dlc/extractor/reverbnation.py b/youtube_dlc/extractor/reverbnation.py new file mode 100644 index 0000000..4cb99c2 --- /dev/null +++ b/youtube_dlc/extractor/reverbnation.py @@ -0,0 +1,53 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + qualities, + str_or_none, +) + + +class ReverbNationIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' + _TESTS = [{ + 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', + 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', + 'info_dict': { + 'id': '16965047', + 'ext': 'mp3', + 'title': 'MONA LISA', + 'uploader': 'ALKILADOS', + 'uploader_id': '216429', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }] + + def _real_extract(self, url): + song_id = self._match_id(url) + + api_res = self._download_json( + 'https://api.reverbnation.com/song/%s' % song_id, + song_id, + note='Downloading information of song %s' % song_id + ) + + THUMBNAILS = ('thumbnail', 'image') + quality = qualities(THUMBNAILS) + thumbnails = [] + for thumb_key in THUMBNAILS: + if api_res.get(thumb_key): + thumbnails.append({ + 'url': api_res[thumb_key], + 'preference': quality(thumb_key) + }) + + return { + 'id': song_id, + 'title': api_res['name'], + 'url': api_res['url'], + 'uploader': api_res.get('artist', {}).get('name'), + 'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), + 'thumbnails': thumbnails, + 'ext': 'mp3', + 'vcodec': 'none', + } diff --git a/youtube_dlc/extractor/rice.py b/youtube_dlc/extractor/rice.py new file mode 100644 index 0000000..f855719 --- /dev/null +++ b/youtube_dlc/extractor/rice.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import 
unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_iso8601, + ExtractorError, +) + + +class RICEIE(InfoExtractor): + _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' + _TEST = { + 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', + 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a', + 'info_dict': { + 'id': 'YEWIvbhb40aqdjMD1ALSqw', + 'ext': 'mp4', + 'title': 'Active Learning in Archeology', + 'upload_date': '20140616', + 'timestamp': 1402926346, + } + } + _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): + raise ExtractorError('Invalid URL', expected=True) + + portal_id = qs['PortalID'][0] + playlist_id = qs['DestinationID'][0] + content_id = qs['ContentID'][0] + + content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ + 'portalId': portal_id, + 'playlistId': playlist_id, + 'contentId': content_id + }) + metadata = xpath_element(content_data, './/metaData', fatal=True) + title = xpath_text(metadata, 'primaryTitle', fatal=True) + encodings = xpath_element(content_data, './/encodings', fatal=True) + player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ + 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), + 'contentId': content_id, + }) + + common_fmt = {} + dimensions = xpath_text(encodings, 'dimensions') + if dimensions: + wh = dimensions.split('x') + if len(wh) == 2: + common_fmt.update({ + 'width': 
int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + + formats = [] + rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS)) + if rtsp_path: + fmt = { + 'url': rtsp_path, + 'format_id': 'rtsp', + } + fmt.update(common_fmt) + formats.append(fmt) + for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): + video_url = xpath_text(source, self._xpath_ns('File', self._NS)) + if not video_url: + continue + if '.m3u8' in video_url: + formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + fmt = { + 'url': video_url, + 'format_id': video_url.split(':')[0], + } + fmt.update(common_fmt) + rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + }) + formats.append(fmt) + self._sort_formats(formats) + + thumbnails = [] + for content_asset in content_data.findall('.//contentAssets'): + asset_type = xpath_text(content_asset, 'type') + if asset_type == 'image': + image_url = xpath_text(content_asset, 'httpPath') + if not image_url: + continue + thumbnails.append({ + 'id': xpath_text(content_asset, 'ID'), + 'url': image_url, + }) + + return { + 'id': content_id, + 'title': title, + 'description': xpath_text(metadata, 'abstract'), + 'duration': int_or_none(xpath_text(metadata, 'duration')), + 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/rmcdecouverte.py b/youtube_dlc/extractor/rmcdecouverte.py new file mode 100644 index 0000000..c3623ed --- /dev/null +++ b/youtube_dlc/extractor/rmcdecouverte.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + 
compat_parse_qs, + compat_urlparse, +) +from ..utils import smuggle_url + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))' + + _TESTS = [{ + 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/', + 'info_dict': { + 'id': '5983675500001', + 'ext': 'mp4', + 'title': 'CORVETTE', + 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', + 'uploader_id': '1969646226001', + 'upload_date': '20181226', + 'timestamp': 1545861635, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'only available for a week', + }, { + # live, geo restricted, bypassable + 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') or mobj.group('live_id') + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['FR']}), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dlc/extractor/ro220.py b/youtube_dlc/extractor/ro220.py new file mode 100644 index 0000000..69934ef --- /dev/null +++ b/youtube_dlc/extractor/ro220.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class Ro220IE(InfoExtractor): + IE_NAME = '220.ro' + _VALID_URL = 
r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/', + 'md5': '03af18b73a07b4088753930db7a34add', + 'info_dict': { + 'id': 'LYV6doKo7f', + 'ext': 'mp4', + 'title': 'Luati-le Banii sez 4 ep 1', + 'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + url = compat_urllib_parse_unquote(self._search_regex( + r'(?s)clip\s*:\s*{.*?url\s*:\s*\'([^\']+)\'', webpage, 'url')) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + formats = [{ + 'format_id': 'sd', + 'url': url, + 'ext': 'mp4', + }] + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dlc/extractor/rockstargames.py b/youtube_dlc/extractor/rockstargames.py new file mode 100644 index 0000000..cd6904b --- /dev/null +++ b/youtube_dlc/extractor/rockstargames.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class RockstarGamesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.rockstargames.com/videos/video/11544/', + 'md5': '03b5caa6e357a4bd50e3143fc03e5733', + 'info_dict': { + 'id': '11544', + 'ext': 'mp4', + 'title': 'Further Adventures in Finance and Felony Trailer', + 'description': 'md5:6d31f55f30cb101b5476c4a379e324a3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1464876000, + 'upload_date': '20160602', + } + }, { + 'url': 
'http://www.rockstargames.com/videos#/?video=48', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://www.rockstargames.com/videoplayer/videos/get-video.json', + video_id, query={ + 'id': video_id, + 'locale': 'en_us', + })['video'] + + title = video['title'] + + formats = [] + for video in video['files_processed']['video/mp4']: + if not video.get('src'): + continue + resolution = video.get('resolution') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', resolution or '', 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(video['src']), + 'format_id': resolution, + 'height': height, + }) + + if not formats: + youtube_id = video.get('youtube_id') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('screencap')), + 'timestamp': parse_iso8601(video.get('created')), + 'formats': formats, + } diff --git a/youtube_dlc/extractor/roosterteeth.py b/youtube_dlc/extractor/roosterteeth.py new file mode 100644 index 0000000..8883639 --- /dev/null +++ b/youtube_dlc/extractor/roosterteeth.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '9156', + 'display_id': 
'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But... The Game Announcement', + 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', + 'thumbnail': r're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... The Game Announcement', + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }, { + 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'only_matching': True, + }] + _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or 
resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) + + def _real_initialize(self): + if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): + return + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + api_episode_url = self._EPISODE_BASE_URL + display_id + + try: + m3u8_url = self._download_json( + api_episode_url + '/videos', display_id, + 'Downloading video JSON metadata')['data'][0]['attributes']['url'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + raise + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + episode = self._download_json( + api_episode_url, display_id, + 'Downloading episode JSON metadata')['data'][0] + attributes = episode['attributes'] + title = attributes.get('title') or attributes['display_title'] + video_id = compat_str(episode['id']) + + thumbnails = [] + for image in episode.get('included', {}).get('images', []): + if image.get('type') == 'episode_image': + img_attributes = image.get('attributes') or {} + for k in ('thumb', 'small', 'medium', 'large'): + img_url = img_attributes.get(k) + if img_url: + thumbnails.append({ + 'id': k, + 'url': img_url, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': attributes.get('description') or attributes.get('caption'), + 'thumbnails': thumbnails, + 'series': attributes.get('show_title'), + 'season_number': int_or_none(attributes.get('season_number')), + 'season_id': attributes.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(attributes.get('number')), + 'episode_id': str_or_none(episode.get('uuid')), + 'formats': formats, + 
'channel_id': attributes.get('channel_id'), + 'duration': int_or_none(attributes.get('length')), + } diff --git a/youtube_dlc/extractor/rottentomatoes.py b/youtube_dlc/extractor/rottentomatoes.py new file mode 100644 index 0000000..14c8e82 --- /dev/null +++ b/youtube_dlc/extractor/rottentomatoes.py @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .internetvideoarchive import InternetVideoArchiveIE + + +class RottenTomatoesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + + _TEST = { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', + 'info_dict': { + 'id': '11028566', + 'ext': 'mp4', + 'title': 'Toy Story 3', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') + + return { + '_type': 'url_transparent', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, + 'ie_key': InternetVideoArchiveIE.ie_key(), + 'id': video_id, + 'title': self._og_search_title(webpage), + } diff --git a/youtube_dlc/extractor/roxwel.py b/youtube_dlc/extractor/roxwel.py new file mode 100644 index 0000000..6528464 --- /dev/null +++ b/youtube_dlc/extractor/roxwel.py @@ -0,0 +1,53 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' + + _TEST = { + 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', + 
'info_dict': { + 'id': 'passionpittakeawalklive', + 'ext': 'flv', + 'title': 'Take A Walk (live)', + 'uploader': 'Passion Pit', + 'uploader_id': 'passionpit', + 'upload_date': '20120928', + 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + filename = mobj.group('filename') + info_url = 'http://www.roxwel.com/api/videos/%s' % filename + info = self._download_json(info_url, filename) + + rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) + best_rate = rtmp_rates[-1] + url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) + rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url') + ext = determine_ext(rtmp_url) + if ext == 'f4v': + rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + + return { + 'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } diff --git a/youtube_dlc/extractor/rozhlas.py b/youtube_dlc/extractor/rozhlas.py new file mode 100644 index 0000000..fccf694 --- /dev/null +++ b/youtube_dlc/extractor/rozhlas.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_start, +) + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka 
(30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) + + title = self._html_search_regex( + r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'title', default=None) or remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) + + return { + 'id': audio_id, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'vcodec': 'none', + } diff --git a/youtube_dlc/extractor/rtbf.py b/youtube_dlc/extractor/rtbf.py new file mode 100644 index 0000000..3b0f308 --- /dev/null +++ b/youtube_dlc/extractor/rtbf.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + strip_or_none, +) + + +class RTBFIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?rtbf\.be/ + (?: + video/[^?]+\?.*\bid=| + ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| + auvio/[^/]+\?.*\b(?P<live>l)?id= + )(?P<id>\d+)''' + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '8c876a1cceeb6cf31b476461ade72384', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au 
coeur (épisode 2)', + 'description': '(du 25/04/2014)', + 'duration': 3099.54, + 'upload_date': '20140425', + 'timestamp': 1398456300, + } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', + 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, + }] + _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' + _PROVIDERS = { + 'YOUTUBE': 'Youtube', + 'DAILYMOTION': 'Dailymotion', + 'VIMEO': 'Vimeo', + } + _QUALITIES = [ + ('mobile', 'SD'), + ('web', 'MD'), + ('high', 'HD'), + ] + + def _real_extract(self, url): + live, media_id = re.match(self._VALID_URL, url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + data = self._parse_json(self._html_search_regex( + r'data-media="([^"]+)"', embed_page, 'media data'), media_id) + + error = data.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + provider = data.get('provider') + if provider in self._PROVIDERS: + return self.url_result(data['url'], self._PROVIDERS[provider]) + + title = data['title'] + is_live = data.get('isLive') + if is_live: + title = self._live_title(title) + height_re = r'-(\d+)p\.' 
+ formats = [] + + m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' % height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': fix_url(format_url), + 'height': height, + }) + + mpd_url = data.get('urlDash') + if not data.get('drm') and mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, media_id, mpd_id='dash', fatal=False)) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + subtitles = {} + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) + + return { + 'id': media_id, + 'formats': formats, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/youtube_dlc/extractor/rte.py 
b/youtube_dlc/extractor/rte.py new file mode 100644 index 0000000..1fbc729 --- /dev/null +++ b/youtube_dlc/extractor/rte.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + float_or_none, + parse_iso8601, + str_or_none, + try_get, + unescapeHTML, + url_or_none, + ExtractorError, +) + + +class RteBaseIE(InfoExtractor): + def _real_extract(self, url): + item_id = self._match_id(url) + + info_dict = {} + formats = [] + + ENDPOINTS = ( + 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=', + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=', + ) + + for num, ep_url in enumerate(ENDPOINTS, start=1): + try: + data = self._download_json(ep_url + item_id, item_id) + except ExtractorError as ee: + if num < len(ENDPOINTS) or formats: + continue + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise + + # NB the string values in the JSON are stored using XML escaping(!) 
+ show = try_get(data, lambda x: x['shows'][0], dict) + if not show: + continue + + if not info_dict: + title = unescapeHTML(show['title']) + description = unescapeHTML(show.get('description')) + thumbnail = show.get('thumbnail') + duration = float_or_none(show.get('duration'), 1000) + timestamp = parse_iso8601(show.get('published')) + info_dict = { + 'id': item_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + } + + mg = try_get(show, lambda x: x['media:group'][0], dict) + if not mg: + continue + + if mg.get('url'): + m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url']) + if m: + m = m.groupdict() + formats.append({ + 'url': m['url'] + '/' + m['app'], + 'app': m['app'], + 'play_path': m['playpath'], + 'player_url': url, + 'ext': 'flv', + 'format_id': 'rtmp', + }) + + if mg.get('hls_server') and mg.get('hls_url'): + formats.extend(self._extract_m3u8_formats( + mg['hls_server'] + mg['hls_url'], item_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + if mg.get('hds_server') and mg.get('hds_url'): + formats.extend(self._extract_f4m_formats( + mg['hds_server'] + mg['hds_url'], item_id, + f4m_id='hds', fatal=False)) + + mg_rte_server = str_or_none(mg.get('rte:server')) + mg_url = str_or_none(mg.get('url')) + if mg_rte_server and mg_url: + hds_url = url_or_none(mg_rte_server + mg_url) + if hds_url: + formats.extend(self._extract_f4m_formats( + hds_url, item_id, f4m_id='hds', fatal=False)) + + self._sort_formats(formats) + + info_dict['formats'] = formats + return info_dict + + +class RteIE(RteBaseIE): + IE_NAME = 'rte' + IE_DESC = 'Raidió Teilifís Éireann TV' + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', + 'md5': '4a76eb3396d98f697e6e8110563d2604', + 'info_dict': { + 'id': '10478715', + 'ext': 'mp4', + 'title': 
'iWitness', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, + 'upload_date': '20151012', + 'timestamp': 1444694160, + }, + } + + +class RteRadioIE(RteBaseIE): + IE_NAME = 'rte:radio' + IE_DESC = 'Raidió Teilifís Éireann radio' + # Radioplayer URLs have two distinct specifier formats, + # the old format #!rii=<channel_id>:<id>:<playable_item_id>:<date>: + # the new format #!rii=b<channel_id>_<id>_<playable_item_id>_<date>_ + # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. + # An <id> uniquely defines an individual recording, and is the only part we require. + _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P<id>[0-9]+)' + + _TESTS = [{ + # Old-style player URL; HLS and RTMPE formats + 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', + 'md5': 'c79ccb2c195998440065456b69760411', + 'info_dict': { + 'id': '10507902', + 'ext': 'mp4', + 'title': 'Gloria', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', + 'timestamp': 1451203200, + 'upload_date': '20151227', + 'duration': 7230.0, + }, + }, { + # New-style player URL; RTMPE formats only + 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', + 'info_dict': { + 'id': '3250678', + 'ext': 'flv', + 'title': 'The Lyric Concert with Paul Herriott', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': '', + 'timestamp': 1333742400, + 'upload_date': '20120406', + 'duration': 7199.016, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] diff --git a/youtube_dlc/extractor/rtl2.py b/youtube_dlc/extractor/rtl2.py new file mode 100644 index 0000000..70f000c --- /dev/null +++ b/youtube_dlc/extractor/rtl2.py @@ -0,0 +1,207 @@ +# coding: utf-8 +from __future__ 
import unicode_literals + +import re + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import ( + compat_b64decode, + compat_ord, + compat_str, +) +from ..utils import ( + bytes_to_intlist, + ExtractorError, + intlist_to_bytes, + int_or_none, + strip_or_none, +) + + +class RTL2IE(InfoExtractor): + IE_NAME = 'rtl2' + _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', + 'info_dict': { + 'id': 'folge-203-0', + 'ext': 'f4v', + 'title': 'GRIP sucht den Sommerkönig', + 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }, { + 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', + 'info_dict': { + 'id': 'anna-erwischt-alex', + 'ext': 'mp4', + 'title': 'Anna erwischt Alex!', + 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' 
+ }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], + }] + + def _real_extract(self, url): + vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups() + if not vico_id: + webpage = self._download_webpage(url, display_id) + + mobj = re.search( + r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', + webpage) + if mobj: + vico_id = mobj.group('vico_id') + vivi_id = mobj.group('vivi_id') + else: + vico_id = self._html_search_regex( + r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') + vivi_id = self._html_search_regex( + r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') + + info = self._download_json( + 'https://service.rtl2.de/api-player-vipo/video.php', + display_id, query={ + 'vico_id': vico_id, + 'vivi_id': vivi_id, + }) + video_info = info['video'] + title = video_info['titel'] + + formats = [] + + rtmp_url = video_info.get('streamurl') + if rtmp_url: + rtmp_url = rtmp_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') + rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': stream_url, + 'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + 'preference': 1, + }) + + m3u8_url = video_info.get('streamurl_hls') + if m3u8_url: + formats.extend(self._extract_akamai_formats(m3u8_url, display_id)) + + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': title, + 'thumbnail': video_info.get('image'), + 'description': video_info.get('beschreibung'), + 'duration': int_or_none(video_info.get('duration')), + 'formats': formats, + } + + +class RTL2YouBaseIE(InfoExtractor): + 
_BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/' + + +class RTL2YouIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you' + _VALID_URL = r'https?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)' # scheme may be http or https + _TESTS = [{ + 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du', + 'info_dict': { + 'id': '15740', + 'ext': 'mp4', + 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!', + 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01', + 'age_limit': 12, + }, + }, { + 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712', + 'only_matching': True, + }] + _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!' + _GEO_COUNTRIES = ['DE'] + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) + + data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') + stream_url = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(compat_b64decode(data)), + bytes_to_intlist(self._AES_KEY), + bytes_to_intlist(compat_b64decode(iv)) + )) + if b'rtl2_you_video_not_found' in stream_url: + raise ExtractorError('video not found', expected=True) + + formats = self._extract_m3u8_formats( + stream_url[:-compat_ord(stream_url[-1])].decode(), + video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + video_data = self._download_json( + self._BACKWERK_BASE_URL + 'video/' + video_id, video_id) + + series = video_data.get('formatTitle') + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': strip_or_none(video_data.get('description')), + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000), + 'series': 
series, + 'episode': episode, + 'age_limit': int_or_none(video_data.get('minimumAge')), + } + + +class RTL2YouSeriesIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you:series' + _VALID_URL = r'https?://you\.rtl2\.de/videos/(?P<id>\d+)' # scheme may be http or https + _TEST = { + 'url': 'http://you.rtl2.de/videos/115/dragon-ball', + 'info_dict': { + 'id': '115', + }, + 'playlist_mincount': 5, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'videos', + series_id, query={ + 'formatId': series_id, + 'limit': 1000000000, + }) + + entries = [] + for video in stream_data.get('videos', []): + video_id = compat_str(video['videoId']) + if not video_id: + continue + entries.append(self.url_result( + 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id), + 'RTL2You', video_id)) + return self.playlist_result(entries, series_id) diff --git a/youtube_dlc/extractor/rtlnl.py b/youtube_dlc/extractor/rtlnl.py new file mode 100644 index 0000000..9eaa06f --- /dev/null +++ b/youtube_dlc/extractor/rtlnl.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, +) + + +class RtlNlIE(InfoExtractor): + IE_NAME = 'rtl.nl' + IE_DESC = 'rtl.nl and rtlxl.nl' + _VALID_URL = r'''(?x) + https?://(?:(?:www|static)\.)? 
+ (?: + rtlxl\.nl/(?:[^\#]*\#!|programma)/[^/]+/| + rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/)| + embed\.rtl\.nl/\#uuid= + ) + (?P<id>[0-9a-f-]+)''' + + _TESTS = [{ + # new URL schema + 'url': 'https://www.rtlxl.nl/programma/rtl-nieuws/0bd1384d-d970-3086-98bb-5c104e10c26f', + 'md5': '490428f1187b60d714f34e1f2e3af0b6', + 'info_dict': { + 'id': '0bd1384d-d970-3086-98bb-5c104e10c26f', + 'ext': 'mp4', + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1593293400, + 'upload_date': '20200627', + 'duration': 661.08, + }, + }, { + # old URL schema + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'md5': '473d1946c1fdd050b2c0161a4b13c373', + 'info_dict': { + 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'ext': 'mp4', + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1461951000, + 'upload_date': '20160429', + 'duration': 1167.96, + }, + 'skip': '404', + }, { + # best format available a3t + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', + 'md5': 'dea7474214af1271d91ef332fb8be7ea', + 'info_dict': { + 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed', + 'ext': 'mp4', + 'timestamp': 1424039400, + 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'upload_date': '20150215', + 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. 
Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', + } + }, { + # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275) + # best format available nettv + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', + 'info_dict': { + 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', + 'ext': 'mp4', + 'title': 'RTL Nieuws - Meer beelden van overval juwelier', + 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'timestamp': 1437233400, + 'upload_date': '20150718', + 'duration': 30.474, + }, + 'params': { + 'skip_download': True, + }, + }, { + # encrypted m3u8 streams, georestricted + 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', + 'only_matching': True, + }, { + 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', + 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, + }, { + 'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', + 'only_matching': True, + }, { + 'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl', + 'only_matching': True, + }, { + # new embed URL schema + 'url': 'https://embed.rtl.nl/#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', + 'only_matching': True, + }] + + def _real_extract(self, url): + uuid = self._match_id(url) + info = self._download_json( + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, + uuid) + + material = info['material'][0] + title = info['abstracts'][0]['name'] + subtitle = material.get('title') + if subtitle: + title += ' - 
%s' % subtitle + description = material.get('synopsis') + + meta = info.get('meta', {}) + + videopath = material['videopath'] + m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath + + formats = self._extract_m3u8_formats( + m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + thumbnails = [] + + for p in ('poster_base_url', '"thumb_base_url"'): + if not meta.get(p): + continue + + thumbnails.append({ + 'url': self._proto_relative_url(meta[p] + uuid), + 'width': int_or_none(self._search_regex( + r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)), + 'height': int_or_none(self._search_regex( + r'/sz=[0-9]+x([0-9]+)', + meta[p], 'thumbnail height', fatal=False)) + }) + + return { + 'id': uuid, + 'title': title, + 'formats': formats, + 'timestamp': material['original_date'], + 'description': description, + 'duration': parse_duration(material.get('duration')), + 'thumbnails': thumbnails, + } diff --git a/youtube_dlc/extractor/rtp.py b/youtube_dlc/extractor/rtp.py new file mode 100644 index 0000000..02986f4 --- /dev/null +++ b/youtube_dlc/extractor/rtp.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + js_to_json, +) + + +class RTPIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' 
+ _TESTS = [{ + 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', + 'md5': 'e736ce0c665e459ddb818546220b4ef8', + 'info_dict': { + 'id': 'e174042', + 'ext': 'mp3', + 'title': 'Paixões Cruzadas', + 'description': 'As paixões musicais de António Cartaxo e António Macedo', + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, { + 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + 'twitter:title', webpage, display_name='title', fatal=True) + + config = self._parse_json(self._search_regex( + r'(?s)RTPPlayer\(({.+?})\);', webpage, + 'player config'), video_id, js_to_json) + file_url = config['file'] + ext = determine_ext(file_url) + if ext == 'm3u8': + file_key = config.get('fileKey') + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=not file_key) # HLS failure is fatal only when there is no fileKey HTTP fallback + if file_key: + formats.append({ + 'url': 'https://cdn-ondemand.rtp.pt' + file_key, + 'preference': 1, + }) + self._sort_formats(formats) + else: + formats = [{ + 'url': file_url, + 'ext': ext, + }] + if config.get('mediaType') == 'audio': + for f in formats: + f['vcodec'] = 'none' + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': self._html_search_meta(['description', 'twitter:description'], webpage), + 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), + } diff --git a/youtube_dlc/extractor/rts.py b/youtube_dlc/extractor/rts.py new file mode 100644 index 0000000..48f17b8 --- /dev/null +++ b/youtube_dlc/extractor/rts.py @@ -0,0 +1,230 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .srgssr import SRGSSRIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + unescapeHTML, + determine_ext, +) + + +class RTSIE(SRGSSRIE): + IE_DESC = 
'RTS.ch' + _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' + + _TESTS = [ + { + 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', + 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'info_dict': { + 'id': '3449373', + 'display_id': 'les-enfants-terribles', + 'ext': 'mp4', + 'duration': 1488, + 'title': 'Les Enfants Terribles', + 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', + 'uploader': 'Divers', + 'upload_date': '19680921', + 'timestamp': -40280400, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + }, + { + 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', + 'info_dict': { + 'id': '5624065', + 'title': 'Passe-moi les jumelles', + }, + 'playlist_mincount': 4, + }, + { + 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', + 'info_dict': { + 'id': '5745975', + 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', + 'ext': 'mp4', + 'duration': 48, + 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', + 'description': 'Hockey - Playoff', + 'uploader': 'Hockey', + 'upload_date': '20140403', + 'timestamp': 1396556882, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Blocked outside Switzerland', + }, + { + 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', + 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'info_dict': { + 'id': '5745356', + 'display_id': 'londres-cachee-par-un-epais-smog', + 'ext': 'mp4', + 'duration': 33, + 'title': 'Londres cachée par un épais smog', + 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', + 
'uploader': 'L\'actu en vidéo', + 'upload_date': '20140403', + 'timestamp': 1396537322, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + }, + { + 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', + 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae', + 'info_dict': { + 'id': '5706148', + 'display_id': 'urban-hippie-de-damien-krisl-03-04-2014', + 'ext': 'mp3', + 'duration': 123, + 'title': '"Urban Hippie", de Damien Krisl', + 'description': 'Des Hippies super glam.', + 'upload_date': '20140403', + 'timestamp': 1396551600, + }, + }, + { + # article with videos on rhs + 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', + 'info_dict': { + 'id': '6693917', + 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', + }, + 'playlist_mincount': 5, + }, + { + 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + media_id = m.group('rts_id') or m.group('id') + display_id = m.group('display_id') or media_id + + def download_json(internal_id): + return self._download_json( + 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id, + display_id) + + all_info = download_json(media_id) + + # media_id extracted out of URL is not always a real id + if 'video' not in all_info and 'audio' not in all_info: + entries = [] + + for item in all_info.get('items', []): + item_url = item.get('url') + if not item_url: + continue + entries.append(self.url_result(item_url, 'RTS')) + + if not entries: + page, urlh = self._download_webpage_handle(url, display_id) + if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: + return self.url_result(urlh.geturl(), 'RTS') + + # article with videos on rhs + videos = re.findall( + 
r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"', + page) + if not videos: + videos = re.findall( + r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + page) + if videos: + entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] + + if entries: + return self.playlist_result(entries, media_id, all_info.get('title')) + + internal_id = self._html_search_regex( + r'<(?:video|audio) data-id="([0-9]+)"', page, + 'internal video id') + all_info = download_json(internal_id) + + media_type = 'video' if 'video' in all_info else 'audio' + + # check for errors + self.get_media_data('rts', media_type, media_id) + + info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] + + title = info['title'] + + def extract_bitrate(url): + return int_or_none(self._search_regex( + r'-([0-9]+)k\.', url, 'bitrate', default=None)) + + formats = [] + streams = info.get('streams', {}) + for format_id, format_url in streams.items(): + if format_id == 'hds_sd' and 'hds' in streams: + continue + if format_id == 'hls_sd' and 'hls' in streams: + continue + ext = determine_ext(format_url) + if ext in ('m3u8', 'f4m'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' 
not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'tbr': extract_bitrate(format_url), + }) + + for media in info.get('media', []): + media_url = media.get('url') + if not media_url or re.match(r'https?://', media_url): + continue + rate = media.get('rate') + ext = media.get('ext') or determine_ext(media_url, 'mp4') + format_id = ext + if rate: + format_id += '-%dk' % rate + formats.append({ + 'format_id': format_id, + 'url': 'http://download-video.rts.ch/' + media_url, + 'tbr': rate or extract_bitrate(media_url), + }) + + self._check_formats(formats, media_id) + self._sort_formats(formats) + + duration = info.get('duration') or info.get('cutout') or info.get('cutduration') + if isinstance(duration, compat_str): + duration = parse_duration(duration) + + return { + 'id': media_id, + 'display_id': display_id, + 'formats': formats, + 'title': title, + 'description': info.get('intro'), + 'duration': duration, + 'view_count': int_or_none(info.get('plays')), + 'uploader': info.get('programName'), + 'timestamp': parse_iso8601(info.get('broadcast_date')), + 'thumbnail': unescapeHTML(info.get('preview_image_url')), + } diff --git a/youtube_dlc/extractor/rtve.py b/youtube_dlc/extractor/rtve.py new file mode 100644 index 0000000..ce9db06 --- /dev/null +++ b/youtube_dlc/extractor/rtve.py @@ -0,0 +1,292 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import re +import time + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_struct_unpack, +) +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + remove_end, + remove_start, + sanitized_Request, + std_headers, +) + + +def _decrypt_url(png): + encrypted_data = compat_b64decode(png) + text_index = 
encrypted_data.find(b'tEXt') + text_chunk = encrypted_data[text_index - 4:] + length = compat_struct_unpack('!I', text_chunk[:4])[0] + # Use bytearray to get integers when iterating in both python 2.x and 3.x + data = bytearray(text_chunk[8:8 + length]) + data = [chr(b) for b in data if b != 0] + hash_index = data.index('#') + alphabet_data = data[:hash_index] + url_data = data[hash_index + 1:] + if url_data[0] == 'H' and url_data[3] == '%': + # remove useless HQ%% at the start + url_data = url_data[4:] + + alphabet = [] + e = 0 + d = 0 + for l in alphabet_data: + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in url_data: + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + return url + + +class RTVEALaCartaIE(InfoExtractor): + IE_NAME = 'rtve.es:alacarta' + IE_DESC = 'RTVE a la carta' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', + 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'info_dict': { + 'id': '2491869', + 'ext': 'mp4', + 'title': 'Balonmano - Swiss Cup masculina. 
Final: España-Suecia', + 'duration': 5024.566, + }, + }, { + 'note': 'Live stream', + 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', + 'info_dict': { + 'id': '1694255', + 'ext': 'flv', + 'title': 'TODO', + }, + 'skip': 'The f4m manifest can\'t be used yet', + }, { + 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', + 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'info_dict': { + 'id': '4236788', + 'ext': 'mp4', + 'title': 'Servir y proteger - Capítulo 104 ', + 'duration': 3222.0, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, + }, { + 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', + 'only_matching': True, + }, { + 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', + 'only_matching': True, + }] + + def _real_initialize(self): + user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') + manager_info = self._download_json( + 'http://www.rtve.es/odin/loki/' + user_agent_b64, + None, 'Fetching manager info') + self._manager = manager_info['manager'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info = self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + if info['state'] == 'DESPU': + raise ExtractorError('The video is no longer available', expected=True) + title = info['title'] + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) + png_request = sanitized_Request(png_url) + png_request.add_header('Referer', url) + png = self._download_webpage(png_request, video_id, 'Downloading url information') + video_url = _decrypt_url(png) + ext = determine_ext(video_url) + + formats = [] + if not video_url.endswith('.f4m') 
and ext != 'm3u8': + if '?' not in video_url: + video_url = video_url.replace('resources/', 'auth/resources/') + video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') + + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'url': video_url, + }) + self._sort_formats(formats) + + subtitles = None + if info.get('sbtFile') is not None: + subtitles = self.extract_subtitles(video_id, info['sbtFile']) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': info.get('image'), + 'page_url': url, + 'subtitles': subtitles, + 'duration': float_or_none(info.get('duration'), scale=1000), + } + + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + + +class RTVEInfantilIE(InfoExtractor): + IE_NAME = 'rtve.es:infantil' + IE_DESC = 'RTVE infantil' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/' + + _TESTS = [{ + 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', + 'md5': '915319587b33720b8e0357caaa6617e6', + 'info_dict': { + 'id': '3040283', + 'ext': 'mp4', + 'title': 'Maneras de vivir', + 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'duration': 357.958, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, + video_id)['page']['items'][0] + + webpage = self._download_webpage(url, video_id) + vidplayer_id = self._search_regex( + r' id="vidplayer([0-9]+)"', 
webpage, 'internal video ID') + + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id + png = self._download_webpage(png_url, video_id, 'Downloading url information') + video_url = _decrypt_url(png) + + return { + 'id': video_id, + 'ext': 'mp4', + 'title': info['title'], + 'url': video_url, + 'thumbnail': info.get('image'), + 'duration': float_or_none(info.get('duration'), scale=1000), + } + + +class RTVELiveIE(InfoExtractor): + IE_NAME = 'rtve.es:live' + IE_DESC = 'RTVE.es live streams' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'http://www.rtve.es/directo/la-1/', + 'info_dict': { + 'id': 'la-1', + 'ext': 'mp4', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + }, + 'params': { + 'skip_download': 'live stream', + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + start_time = time.gmtime() + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') + title = remove_start(title, 'Estoy viendo ') + title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) + + vidplayer_id = self._search_regex( + (r'playerId=player([0-9]+)', + r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', + r'data-id=["\'](\d+)'), + webpage, 'internal video ID') + png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id + png = self._download_webpage(png_url, video_id, 'Downloading url information') + m3u8_url = _decrypt_url(png) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'is_live': True, + } + + +class RTVETelevisionIE(InfoExtractor): + IE_NAME = 'rtve.es:television' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+)\.shtml' # escaped: literal dot before the extension + + _TEST = { + 'url': 
'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'info_dict': { + 'id': '3069778', + 'ext': 'mp4', + 'title': 'Documentos TV - La revolución del móvil', + 'duration': 3496.948, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + alacarta_url = self._search_regex( + r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', + webpage, 'alacarta url', default=None) + if alacarta_url is None: + raise ExtractorError( + 'The webpage doesn\'t contain any video', expected=True) + + return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/youtube_dlc/extractor/rtvnh.py b/youtube_dlc/extractor/rtvnh.py new file mode 100644 index 0000000..6a00f70 --- /dev/null +++ b/youtube_dlc/extractor/rtvnh.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class RTVNHIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.rtvnh.nl/video/131946', + 'md5': 'cdbec9f44550763c8afc96050fa747dc', + 'info_dict': { + 'id': '131946', + 'ext': 'mp4', + 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', + 'thumbnail': r're:^https?:.*\.jpg$' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + meta = self._parse_json(self._download_webpage( + 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) + + status = meta.get('status') + if status != 200: + raise ExtractorError( + '%s returned error code %d' % (self.IE_NAME, status), expected=True) + + formats = [] + rtmp_formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + formats.extend(rtmp_formats) + + for rtmp_format in rtmp_formats: + rtmp_url = '%s/%s' % (rtmp_format['url'], 
rtmp_format['play_path']) + rtsp_format = rtmp_format.copy() + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'url': rtmp_url.replace('rtmp://', 'rtsp://'), + 'protocol': 'rtsp', + }) + formats.append(rtsp_format) + http_base_url = rtmp_url.replace('rtmp://', 'http://') + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': meta['title'].strip(), + 'thumbnail': meta.get('image'), + 'formats': formats + } diff --git a/youtube_dlc/extractor/rtvs.py b/youtube_dlc/extractor/rtvs.py new file mode 100644 index 0000000..6573b26 --- /dev/null +++ b/youtube_dlc/extractor/rtvs.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTVSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P<id>\d+)' + _TESTS = [{ + # radio archive + 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', + 'md5': '134d5d6debdeddf8a5d761cbc9edacb8', + 'info_dict': { + 'id': '414872', + 'ext': 'mp3', + 'title': 'Ostrov pokladov 1 časť.mp3' + }, + 'params': { + 'skip_download': True, + } + }, { + # tv archive + 'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118', + 'md5': '85e2c55cf988403b70cac24f5c086dc6', + 'info_dict': { + 'id': '63118', + 'ext': 'mp4', + 'title': 'Amaro Džives - Náš deň', + 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.' 
+ }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._search_regex( + r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'playlist url', group='url') + + data = self._download_json( + playlist_url, video_id, 'Downloading playlist')[0] + return self._parse_jwplayer_data(data, video_id=video_id) diff --git a/youtube_dlc/extractor/ruhd.py b/youtube_dlc/extractor/ruhd.py new file mode 100644 index 0000000..3c8053a --- /dev/null +++ b/youtube_dlc/extractor/ruhd.py @@ -0,0 +1,45 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RUHDIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' + _TEST = { + 'url': 'http://www.ruhd.ru/play.php?vid=207', + 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', + 'info_dict': { + 'id': '207', + 'ext': 'divx', + 'title': 'КОТ бааааам', + 'description': 'классный кот)', + 'thumbnail': r're:^http://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'<param name="src" value="([^"]+)"', webpage, 'video url') + title = self._html_search_regex( + r'<title>([^<]+)   RUHD\.ru - Видео Высокого качества №1 в России!', + webpage, 'title') + description = self._html_search_regex( + r'(?s)
(.+?)', + webpage, 'description', fatal=False) + thumbnail = self._html_search_regex( + r'[\da-z]{32})' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '1d24f180fac7a02f3900712e5a5764d6', + 'info_dict': { + 'id': '3eac3b4561676c17df9132a9a1e62e3e', + 'ext': 'mp4', + 'title': 'Раненный кенгуру забежал в аптеку', + 'description': 'http://www.ntdtv.ru ', + 'duration': 81, + 'uploader': 'NTDRussian', + 'uploader_id': '29790', + 'timestamp': 1381943602, + 'upload_date': '20131016', + 'age_limit': 0, + }, + }, { + 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) + + @staticmethod + def _extract_urls(webpage): + return [mobj.group('url') for mobj in re.finditer( + r']+?src=(["\'])(?P(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_and_extract_info(video_id) + info['formats'] = self._download_and_extract_formats(video_id) + return info + + +class RutubeEmbedIE(RutubeBaseIE): + IE_NAME = 'rutube:embed' + IE_DESC = 'Rutube embedded videos' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P[0-9]+)' + + _TESTS = [{ + 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', + 'info_dict': { + 'id': 'a10e53b86e8f349080f718582ce4c661', + 'ext': 'mp4', + 'timestamp': 1387830582, + 'upload_date': '20131223', + 'uploader_id': '297833', + 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids 
и Jetix

восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', + 'uploader': 'subziro89 ILya', + 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://rutube.ru/play/embed/8083783', + 'only_matching': True, + }, { + # private video + 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', + 'only_matching': True, + }] + + def _real_extract(self, url): + embed_id = self._match_id(url) + # Query may contain private videos token and should be passed to API + # requests (see #19163) + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + options = self._download_api_options(embed_id, query) + video_id = options['effective_video'] + formats = self._extract_formats(options, video_id) + info = self._download_and_extract_info(video_id, query) + info.update({ + 'extractor_key': 'Rutube', + 'formats': formats, + }) + return info + + +class RutubePlaylistBaseIE(RutubeBaseIE): + def _next_page_url(self, page_num, playlist_id, *args, **kwargs): + return self._PAGE_TEMPLATE % (playlist_id, page_num) + + def _entries(self, playlist_id, *args, **kwargs): + next_page_url = None + for pagenum in itertools.count(1): + page = self._download_json( + next_page_url or self._next_page_url( + pagenum, playlist_id, *args, **kwargs), + playlist_id, 'Downloading page %s' % pagenum) + + results = page.get('results') + if not results or not isinstance(results, list): + break + + for result in results: + video_url = url_or_none(result.get('video_url')) + if not video_url: + continue + entry = self._extract_info(result, require_title=False) + entry.update({ + '_type': 'url', + 'url': video_url, + 'ie_key': RutubeIE.ie_key(), + }) + yield entry + + next_page_url = page.get('next') + if not next_page_url or not page.get('has_next'): + break + + def _extract_playlist(self, playlist_id, *args, **kwargs): + return self.playlist_result( + self._entries(playlist_id, *args, 
**kwargs), + playlist_id, kwargs.get('playlist_name')) + + def _real_extract(self, url): + return self._extract_playlist(self._match_id(url)) + + +class RutubeChannelIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:channel' + IE_DESC = 'Rutube channels' + _VALID_URL = r'https?://rutube\.ru/tags/video/(?P\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/tags/video/1800/', + 'info_dict': { + 'id': '1800', + }, + 'playlist_mincount': 68, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' + + +class RutubeMovieIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:movie' + IE_DESC = 'Rutube movies' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P\d+)' + _TESTS = [] + + _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' + _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' + + def _real_extract(self, url): + movie_id = self._match_id(url) + movie = self._download_json( + self._MOVIE_TEMPLATE % movie_id, movie_id, + 'Downloading movie JSON') + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) + + +class RutubePersonIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:person' + IE_DESC = 'Rutube person videos' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P\d+)' + _TESTS = [{ + 'url': 'http://rutube.ru/video/person/313878/', + 'info_dict': { + 'id': '313878', + }, + 'playlist_mincount': 37, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P\d+)' + _TESTS = [{ + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', + 'info_dict': { + 'id': '3097', + }, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, + }] + + 
_PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' + + @classmethod + def suitable(cls, url): + if not super(RutubePlaylistIE, cls).suitable(url): + return False + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) + + def _real_extract(self, url): + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) diff --git a/youtube_dlc/extractor/rutv.py b/youtube_dlc/extractor/rutv.py new file mode 100644 index 0000000..aceb359 --- /dev/null +++ b/youtube_dlc/extractor/rutv.py @@ -0,0 +1,211 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none +) + + +class RUTVIE(InfoExtractor): + IE_DESC = 'RUTV.RU' + _VALID_URL = r'''(?x) + https?:// + (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/ + (?P + flash\d+v/container\.swf\?id=| + iframe/(?Pswf|video|live)/id/| + index/iframe/cast_id/ + ) + (?P\d+) + ''' + + _TESTS = [ + { + 'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724', + 'info_dict': { + 'id': '774471', + 'ext': 'mp4', + 'title': 'Монологи на все времена', + 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5', + 'duration': 2906, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638', + 'info_dict': { + 'id': '774016', + 'ext': 'mp4', + 'title': 
'Чужой в семье Сталина', + 'description': '', + 'duration': 2539, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000', + 'info_dict': { + 'id': '766888', + 'ext': 'mp4', + 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"', + 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995', + 'duration': 279, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169', + 'info_dict': { + 'id': '771852', + 'ext': 'mp4', + 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет', + 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8', + 'duration': 3096, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014', + 'info_dict': { + 'id': '51499', + 'ext': 'flv', + 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ', + 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c', + }, + 'skip': 'Translation has finished', + }, + { + 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/', + 'info_dict': { + 'id': '21', + 'ext': 'mp4', + 'title': 're:^Россия 24. 
Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/', + 'only_matching': True, + }, + ] + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r']+?src=(["\'])(?Phttps?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + if mobj: + return mobj.group('url') + + mobj = re.search( + r']+?property=(["\'])og:video\1[^>]+?content=(["\'])(?Phttps?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + video_path = mobj.group('path') + + if re.match(r'flash\d+v', video_path): + video_type = 'video' + elif video_path.startswith('iframe'): + video_type = mobj.group('type') + if video_type == 'swf': + video_type = 'video' + elif video_path.startswith('index/iframe/cast_id'): + video_type = 'live' + + is_live = video_type == 'live' + + json_data = self._download_json( + 'http://player.vgtrk.com/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id), + video_id, 'Downloading JSON') + + if json_data['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True) + + playlist = json_data['data']['playlist'] + medialist = playlist['medialist'] + media = medialist[0] + + if media['errors']: + raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True) + + view_count = playlist.get('count_views') + priority_transport = playlist['priority_transport'] + + thumbnail = media['picture'] + width = int_or_none(media['width']) + height = int_or_none(media['height']) + description = media['anons'] + title = media['title'] + duration = int_or_none(media.get('duration')) 
+ + formats = [] + + for transport, links in media['sources'].items(): + for quality, url in links.items(): + preference = -1 if priority_transport == transport else -2 + if transport == 'rtmp': + mobj = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?P.+)$', url) + if not mobj: + continue + fmt = { + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': 'http://player.rutv.ru', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', + 'rtmp_live': True, + 'ext': 'flv', + 'vbr': int(quality), + 'preference': preference, + } + elif transport == 'm3u8': + formats.extend(self._extract_m3u8_formats( + url, video_id, 'mp4', preference=preference, m3u8_id='hls')) + continue + else: + fmt = { + 'url': url + } + fmt.update({ + 'width': width, + 'height': height, + 'format_id': '%s-%s' % (transport, quality), + }) + formats.append(fmt) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': description, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'duration': duration, + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dlc/extractor/ruutu.py b/youtube_dlc/extractor/ruutu.py new file mode 100644 index 0000000..f984040 --- /dev/null +++ b/youtube_dlc/extractor/ruutu.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + xpath_attr, + xpath_text, +) + + +class RuutuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P\d+)' + _TESTS = [ + { + 'url': 'http://www.ruutu.fi/video/2058907', + 'md5': 'ab2093f39be1ca8581963451b3c0234f', + 'info_dict': { + 'id': '2058907', + 'ext': 'mp4', + 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? 
- Nyt se selvisi!', + 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 114, + 'age_limit': 0, + }, + }, + { + 'url': 'http://www.ruutu.fi/video/2057306', + 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', + 'info_dict': { + 'id': '2057306', + 'ext': 'mp4', + 'title': 'Superpesis: katso koko kausi Ruudussa', + 'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 40, + 'age_limit': 0, + }, + }, + { + 'url': 'http://www.supla.fi/supla/2231370', + 'md5': 'df14e782d49a2c0df03d3be2a54ef949', + 'info_dict': { + 'id': '2231370', + 'ext': 'mp4', + 'title': 'Osa 1: Mikael Jungner', + 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + }, + }, + # Episode where is "NOT-USED", but has other + # downloadable sources available. + { + 'url': 'http://www.ruutu.fi/video/3193728', + 'only_matching': True, + }, + { + # audio podcast + 'url': 'https://www.supla.fi/supla/3382410', + 'md5': 'b9d7155fed37b2ebf6021d74c4b8e908', + 'info_dict': { + 'id': '3382410', + 'ext': 'mp3', + 'title': 'Mikä ihmeen poltergeist?', + 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + }, + 'expected_warnings': ['HTTP Error 502: Bad Gateway'], + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_xml = self._download_xml( + 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + query={'id': video_id}) + + formats = [] + processed_urls = [] + + def extract_formats(node): + for child in node: + if child.tag.endswith('Files'): + extract_formats(child) + elif child.tag.endswith('File'): + video_url = child.text + if (not video_url or video_url in processed_urls + or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): + continue + processed_urls.append(video_url) + ext = determine_ext(video_url) + if ext == 'm3u8': + 
formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mpd': + # video-only and audio-only streams are of different + # duration resulting in out of sync issue + continue + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'mp3' or child.tag == 'AudioMediaFile': + formats.append({ + 'format_id': 'audio', + 'url': video_url, + 'vcodec': 'none', + }) + else: + proto = compat_urllib_parse_urlparse(video_url).scheme + if not child.tag.startswith('HTTP') and proto != 'rtmp': + continue + preference = -1 if proto == 'rtmp' else 1 + label = child.get('label') + tbr = int_or_none(child.get('bitrate')) + format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto + if not self._is_valid_url(video_url, video_id, format_id): + continue + width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]] + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'preference': preference, + }) + + extract_formats(video_xml.find('./Clip')) + + drm = xpath_text(video_xml, './Clip/DRM', default=None) + if not formats and drm: + raise ExtractorError('This video is DRM protected.', expected=True) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), + 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), + 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), + 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'formats': formats, + } diff --git a/youtube_dlc/extractor/ruv.py 
b/youtube_dlc/extractor/ruv.py new file mode 100644 index 0000000..8f3cc40 --- /dev/null +++ b/youtube_dlc/extractor/ruv.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + unified_timestamp, +) + + +class RuvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P[^/]+(?:/\d+)?)' + _TESTS = [{ + # m3u8 + 'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516', + 'md5': '66347652f4e13e71936817102acc1724', + 'info_dict': { + 'id': '1144499', + 'display_id': 'fh-valur/20170516', + 'ext': 'mp4', + 'title': 'FH - Valur', + 'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.', + 'timestamp': 1494963600, + 'upload_date': '20170516', + }, + }, { + # mp3 + 'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619', + 'md5': '395ea250c8a13e5fdb39d4670ef85378', + 'info_dict': { + 'id': '1153630', + 'display_id': 'morgunutvarpid/20170619', + 'ext': 'mp3', + 'title': 'Morgunútvarpið', + 'description': 'md5:a4cf1202c0a1645ca096b06525915418', + 'timestamp': 1497855000, + 'upload_date': '20170619', + }, + }, { + 'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614', + 'only_matching': True, + }, { + 'url': 'http://www.ruv.is/node/1151854', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + + FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' + + media_url = self._html_search_regex( + FIELD_RE % 'src', webpage, 'video URL', group='url') + + video_id = self._search_regex( + r']+\bhref=["\']https?://www\.ruv\.is/node/(\d+)', + webpage, 'video 
id', default=display_id) + + ext = determine_ext(media_url) + + if ext == 'm3u8': + formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + elif ext == 'mp3': + formats = [{ + 'format_id': 'mp3', + 'url': media_url, + 'vcodec': 'none', + }] + else: + formats = [{ + 'url': media_url, + }] + + description = self._og_search_description(webpage, default=None) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._search_regex( + FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False) + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/safari.py b/youtube_dlc/extractor/safari.py new file mode 100644 index 0000000..2cc6651 --- /dev/null +++ b/youtube_dlc/extractor/safari.py @@ -0,0 +1,264 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor + +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + update_url_query, +) + + +class SafariBaseIE(InfoExtractor): + _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/' + _NETRC_MACHINE = 'safari' + + _API_BASE = 'https://learning.oreilly.com/api/v1' + _API_FORMAT = 'json' + + LOGGED_IN = False + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + _, urlh = self._download_webpage_handle( + 'https://learning.oreilly.com/accounts/login-check/', None, + 'Downloading login page') + + def is_logged(urlh): + return 'learning.oreilly.com/home/' in urlh.geturl() + + if is_logged(urlh): + self.LOGGED_IN = True + return + + redirect_url = urlh.geturl() + 
parsed_url = compat_urlparse.urlparse(redirect_url) + qs = compat_parse_qs(parsed_url.query) + next_uri = compat_urlparse.urljoin( + 'https://api.oreilly.com', qs['next'][0]) + + auth, urlh = self._download_json_handle( + 'https://www.oreilly.com/member/auth/login/', None, 'Logging in', + data=json.dumps({ + 'email': username, + 'password': password, + 'redirect_uri': next_uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': redirect_url, + }, expected_status=400) + + credentials = auth.get('credentials') + if (not auth.get('logged_in') and not auth.get('redirect_uri') + and credentials): + raise ExtractorError( + 'Unable to login: %s' % credentials, expected=True) + + # oreilly serves two same instances of the following cookies + # in Set-Cookie header and expects first one to be actually set + for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'): + self._apply_first_set_cookie_header(urlh, cookie) + + _, urlh = self._download_webpage_handle( + auth.get('redirect_uri') or next_uri, None, 'Completing login',) + + if is_logged(urlh): + self.LOGGED_IN = True + return + + raise ExtractorError('Unable to log in') + + +class SafariIE(SafariBaseIE): + IE_NAME = 'safari' + IE_DESC = 'safaribooksonline.com online video' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| + videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) + ) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', + 'md5': 'dcc5a425e79f2564148652616af1f2a3', + 'info_dict': { + 'id': '0_qbqx90ic', + 'ext': 'mp4', + 'title': 'Introduction to Hadoop Fundamentals LiveLessons', + 'timestamp': 1437758058, + 'upload_date': '20150724', + 'uploader_id': 'stork', + }, + }, { + # non-digits in course id + 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', + 
'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', + 'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html', + 'only_matching': True, + }] + + _PARTNER_ID = '1926081' + _UICONF_ID = '29375172' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + reference_id = mobj.group('reference_id') + if reference_id: + video_id = reference_id + partner_id = self._PARTNER_ID + ui_id = self._UICONF_ID + else: + video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + mobj = re.match(self._VALID_URL, urlh.geturl()) + reference_id = mobj.group('reference_id') + if not reference_id: + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura widget id', default=self._PARTNER_ID, + group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'kaltura uiconf id', default=self._UICONF_ID, + group='id') + + query = { + 'wid': '_%s' % partner_id, + 'uiconf_id': ui_id, + 'flashvars[referenceId]': reference_id, + } + + if self.LOGGED_IN: + kaltura_session = self._download_json( + '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), + video_id, 'Downloading kaltura session JSON', + 'Unable to download kaltura session JSON', fatal=False, + headers={'Accept': 
'application/json'}) + if kaltura_session: + session = kaltura_session.get('session') + if session: + query['flashvars[ks]'] = session + + return self.url_result(update_url_query( + 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), + 'Kaltura') + + +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + part = self._download_json( + url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + return self.url_result(part['web_url'], SafariIE.ie_key()) + + +class SafariCourseIE(SafariBaseIE): + IE_NAME = 'safari:course' + IE_DESC = 'safaribooksonline.com online courses' + + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ + (?: + library/view/[^/]+| + api/v1/book| + videos/[^/]+ + )| + techbus\.safaribooksonline\.com + ) + /(?P[^/]+) + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'info_dict': { + 'id': '9780133392838', + 'title': 'Hadoop Fundamentals LiveLessons', + }, + 'playlist_count': 22, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', + 'only_matching': True, + }, { + 'url': 'http://techbus.safaribooksonline.com/9780134426365', + 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', + 
'only_matching': True, + }, { + 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', + 'only_matching': True, + }, { + 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) + else super(SafariCourseIE, cls).suitable(url)) + + def _real_extract(self, url): + course_id = self._match_id(url) + + course_json = self._download_json( + '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + course_id, 'Downloading course JSON') + + if 'chapters' not in course_json: + raise ExtractorError( + 'No chapters found for course %s' % course_id, expected=True) + + entries = [ + self.url_result(chapter, SafariApiIE.ie_key()) + for chapter in course_json['chapters']] + + course_title = course_json['title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dlc/extractor/sapo.py b/youtube_dlc/extractor/sapo.py new file mode 100644 index 0000000..49a9b31 --- /dev/null +++ b/youtube_dlc/extractor/sapo.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class SapoIE(InfoExtractor): + IE_DESC = 'SAPO Vídeos' + _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P[\da-zA-Z]{20})' + + _TESTS = [ + { + 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', + 'md5': '79ee523f6ecb9233ac25075dee0eda83', + 'note': 'SD video', + 'info_dict': { + 'id': 'UBz95kOtiWYUMTA5Ghfi', + 'ext': 'mp4', + 'title': 'Benfica - Marcas na Hitória', + 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', + 'duration': 264, + 'uploader': 'tiago_1988', + 'upload_date': '20080229', + 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 
'joao', 'karel', 'lisboa', 'miccoli'], + }, + }, + { + 'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', + 'md5': '90a2f283cfb49193fe06e861613a72aa', + 'note': 'HD video', + 'info_dict': { + 'id': 'IyusNAZ791ZdoCY5H5IF', + 'ext': 'mp4', + 'title': 'Codebits VII - Report', + 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', + 'duration': 144, + 'uploader': 'codebits', + 'upload_date': '20140427', + 'categories': ['codebits', 'codebits2014'], + }, + }, + { + 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', + 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', + 'note': 'v2 video', + 'info_dict': { + 'id': 'yLqjzPtbTimsn2wWBKHz', + 'ext': 'mp4', + 'title': 'Hipnose Condicionativa 4', + 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', + 'duration': 692, + 'uploader': 'sapozen', + 'upload_date': '20090609', + 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + item = self._download_xml( + 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') + + title = item.find('./title').text + description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text + thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') + duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) + uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text + upload_date = unified_strdate(item.find('./pubDate').text) + view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) + comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) + tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text + categories = tags.split() if tags else [] + age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 + + video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text + video_size = 
item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'sd', + 'width': int(video_size[0]), + 'height': int(video_size[1]), + }] + + if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': + formats.append({ + 'url': re.sub(r'/mov/1$', '/mov/39', video_url), + 'ext': 'mp4', + 'format_id': 'hd', + 'width': 1280, + 'height': 720, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': age_limit, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/savefrom.py b/youtube_dlc/extractor/savefrom.py new file mode 100644 index 0000000..21e44b6 --- /dev/null +++ b/youtube_dlc/extractor/savefrom.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path +import re + +from .common import InfoExtractor + + +class SaveFromIE(InfoExtractor): + IE_NAME = 'savefrom.net' + _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P.*)$' + + _TEST = { + 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com', + 'info_dict': { + 'id': 'UlVRAPW2WJY', + 'ext': 'mp4', + 'title': 'About Team Radical MMA | MMA Fighting', + 'upload_date': '20120816', + 'uploader': 'Howcast', + 'uploader_id': 'Howcast', + 'description': r're:(?s).* Hi, my name is Rene Dreifuss\. 
And I\'m here to show you some MMA.*', + }, + 'params': { + 'skip_download': True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = os.path.splitext(url.split('/')[-1])[0] + + return self.url_result(mobj.group('url'), video_id=video_id) diff --git a/youtube_dlc/extractor/sbs.py b/youtube_dlc/extractor/sbs.py new file mode 100644 index 0000000..0e623ff --- /dev/null +++ b/youtube_dlc/extractor/sbs.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + smuggle_url, + ExtractorError, +) + + +class SBSIE(InfoExtractor): + IE_DESC = 'sbs.com.au' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P[0-9]+)' + + _TESTS = [{ + # Original URL is handled by the generic IE which finds the iframe: + # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation + 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', + 'md5': '3150cf278965eeabb5b4cea1c963fe0a', + 'info_dict': { + 'id': '320403011771', + 'ext': 'mp4', + 'title': 'Dingo Conservation (The Feed)', + 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', + 'thumbnail': r're:http://.*\.jpg', + 'duration': 308, + 'timestamp': 1408613220, + 'upload_date': '20140821', + 'uploader': 'SBSC', + }, + }, { + 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', + 'only_matching': True, + }, { + 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + player_params = self._download_json( + 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) + + error = player_params.get('error') + if error: + error_message = 'Sorry, The video you are looking for does not exist.' 
+ video_data = error.get('results') or {} + error_code = error.get('errorCode') + if error_code == 'ComingSoon': + error_message = '%s is not yet available.' % video_data.get('title', '') + elif error_code in ('Forbidden', 'intranetAccessOnly'): + error_message = 'Sorry, This video cannot be accessed via this website' + elif error_code == 'Expired': + error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '') + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + + urls = player_params['releaseUrls'] + theplatform_url = (urls.get('progressive') or urls.get('html') + or urls.get('standard') or player_params['relatedItemsURL']) + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'id': video_id, + 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), + } diff --git a/youtube_dlc/extractor/screencast.py b/youtube_dlc/extractor/screencast.py new file mode 100644 index 0000000..69a0d01 --- /dev/null +++ b/youtube_dlc/extractor/screencast.py @@ -0,0 +1,123 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_request, +) +from ..utils import ( + ExtractorError, +) + + +class ScreencastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'http://www.screencast.com/t/3ZEjQXlT', + 'md5': '917df1c13798a3e96211dd1561fded83', + 'info_dict': { + 'id': '3ZEjQXlT', + 'ext': 'm4v', + 'title': 'Color Measurement with Ocean Optics Spectrometers', + 'description': 'md5:240369cde69d8bed61349a199c5fb153', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/V2uXehPJa1ZI', + 'md5': 'e8e4b375a7660a9e7e35c33973410d34', + 'info_dict': { + 'id': 'V2uXehPJa1ZI', + 'ext': 'mov', + 'title': 'The Amadeus Spectrometer', + 'description': 're:^In this video, our friends at.*To learn 
more about Amadeus, visit', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/aAB3iowa', + 'md5': 'dedb2734ed00c9755761ccaee88527cd', + 'info_dict': { + 'id': 'aAB3iowa', + 'ext': 'mp4', + 'title': 'Google Earth Export', + 'description': 'Provides a demo of a CommunityViz export to Google Earth, one of the 3D viewing options.', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://www.screencast.com/t/X3ddTrYh', + 'md5': '669ee55ff9c51988b4ebc0877cc8b159', + 'info_dict': { + 'id': 'X3ddTrYh', + 'ext': 'wmv', + 'title': 'Toolkit 6 User Group Webinar (2014-03-04) - Default Judgment and First Impression', + 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } + }, { + 'url': 'http://screencast.com/t/aAB3iowa', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'(?:(?!\1).)+)\1', + webpage, 'video url', default=None, group='url') + + if video_url is None: + video_url = self._html_search_meta( + 'og:video', webpage, default=None) + + if video_url is None: + raise ExtractorError('Cannot find video') + + title = self._og_search_title(webpage, default=None) + if title is None: + title = self._html_search_regex( + [r'Title: ([^<]+)
', + r'class="tabSeperator">>(.+?)<', + r'([^<]+)'], + webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage, default=None) + if description is None: + description = self._html_search_meta('description', webpage) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dlc/extractor/screencastomatic.py b/youtube_dlc/extractor/screencastomatic.py new file mode 100644 index 0000000..b5e76c9 --- /dev/null +++ b/youtube_dlc/extractor/screencastomatic.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class ScreencastOMaticIE(InfoExtractor): + _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' + _TEST = { + 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', + 'md5': '483583cb80d92588f15ccbedd90f0c18', + 'info_dict': { + 'id': 'c2lD3BeOPl', + 'ext': 'mp4', + 'title': 'Welcome to 3-4 Philosophy @ DECV!', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'as the title says! 
also: some general info re 1) VCE philosophy and 2) distance learning.', + 'duration': 369.163, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + jwplayer_data = self._parse_json( + self._search_regex( + r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), + video_id, transform_source=js_to_json) + + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict.update({ + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + }) + return info_dict diff --git a/youtube_dlc/extractor/scrippsnetworks.py b/youtube_dlc/extractor/scrippsnetworks.py new file mode 100644 index 0000000..b40b4c4 --- /dev/null +++ b/youtube_dlc/extractor/scrippsnetworks.py @@ -0,0 +1,152 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import hashlib +import re + +from .aws import AWSIE +from .anvato import AnvatoIE +from .common import InfoExtractor +from ..utils import ( + smuggle_url, + urlencode_postdata, + xpath_text, +) + + +class ScrippsNetworksWatchIE(AWSIE): + IE_NAME = 'scrippsnetworks:watch' + _VALID_URL = r'''(?x) + https?:// + watch\. 
+ (?Pgeniuskitchen)\.com/ + (?: + player\.[A-Z0-9]+\.html\#| + show/(?:[^/]+/){2}| + player/ + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', + 'info_dict': { + 'id': '4194875', + 'ext': 'mp4', + 'title': 'Ample Hills Ice Cream Bike', + 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', + 'uploader': 'ANV', + 'upload_date': '20171011', + 'timestamp': 1507698000, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [AnvatoIE.ie_key()], + }] + + _SNI_TABLE = { + 'geniuskitchen': 'genius', + } + + _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' + _AWS_PROXY_HOST = 'web.api.video.snidigital.com' + + _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id, video_id = mobj.group('site', 'id') + + aws_identity_id_json = json.dumps({ + 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION + }).encode('utf-8') + token = self._download_json( + 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, + data=aws_identity_id_json, + headers={ + 'Accept': '*/*', + 'Content-Type': 'application/x-amz-json-1.1', + 'Referer': url, + 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), + 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + })['Token'] + + sts = self._download_xml( + 'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({ + 'Action': 'AssumeRoleWithWebIdentity', + 'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role', + 'RoleSessionName': 'web-identity', + 'Version': '2011-06-15', + 'WebIdentityToken': token, + }), headers={ + 'Referer': url, + 'X-Amz-User-Agent': self._AWS_USER_AGENT, + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) + + def get(key): + return xpath_text( + sts, 
'.//{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, + fatal=True) + + mcp_id = self._aws_execute_api({ + 'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id), + 'access_key': get('AccessKeyId'), + 'secret_key': get('SecretAccessKey'), + 'session_token': get('SessionToken'), + }, video_id)['results'][0]['mcpId'] + + return self.url_result( + smuggle_url( + 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, + {'geo_countries': ['US']}), + AnvatoIE.ie_key(), video_id=mcp_id) + + +class ScrippsNetworksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?Pcookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', + 'info_dict': { + 'id': '0260338', + 'ext': 'mp4', + 'title': 'The Best of the Best', + 'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.', + 'timestamp': 1475678834, + 'upload_date': '20161005', + 'uploader': 'SCNI-SCND', + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', + 'only_matching': True, + }, { + 'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591', + 'only_matching': True, + }, { + 'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929', + 'only_matching': True, + }, { + 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', + 'only_matching': True, + }, { + 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', + 'only_matching': True, + }] + _ACCOUNT_MAP = { + 'cookingchanneltv': 2433005105, + 'discovery': 2706091867, + 'diynetwork': 2433004575, + 'foodnetwork': 2433005105, + 'hgtv': 2433004575, + 'travelchannel': 2433005739, + } + _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true' + + def _real_extract(self, url): 
+ site, guid = re.match(self._VALID_URL, url).groups() + return self.url_result(smuggle_url( + self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid), + {'force_smil_url': True}), 'ThePlatform', guid) diff --git a/youtube_dlc/extractor/scte.py b/youtube_dlc/extractor/scte.py new file mode 100644 index 0000000..ca1de63 --- /dev/null +++ b/youtube_dlc/extractor/scte.py @@ -0,0 +1,144 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + urlencode_postdata, +) + + +class SCTEBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx' + _NETRC_MACHINE = 'scte' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_popup = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']welcome\b', r'>Sign Out<')) + + # already logged in + if is_logged(login_popup): + return + + login_form = self._hidden_inputs(login_popup) + + login_form.update({ + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password, + 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on', + }) + + response = self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + if '|pageRedirect|' not in response and not is_logged(response): + error = self._html_search_regex( + r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484', + 'info_dict': { + 'title': 'Introduction to DOCSIS Engineering Professional', + 'id': '31484', + }, + 
'playlist_count': 5, + 'skip': 'Requires account credentials', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex(r'

(.+?)

', webpage, 'title') + + context_id = self._search_regex(r'context-(\d+)', webpage, video_id) + content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id + context = decode_packed_codes(self._download_webpage( + '%smobile/data.js' % content_base, video_id)) + + data = self._parse_xml( + self._search_regex( + r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"), + video_id) + + entries = [] + for asset in data.findall('.//asset'): + asset_url = asset.get('url') + if not asset_url or not asset_url.endswith('.mp4'): + continue + asset_id = self._search_regex( + r'video_([^_]+)_', asset_url, 'asset id', default=None) + if not asset_id: + continue + entries.append({ + 'id': asset_id, + 'title': title, + 'url': content_base + asset_url, + }) + + return self.playlist_result(entries, video_id, title) + + +class SCTECourseIE(SCTEBaseIE): + _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P\d+)' + _TESTS = [{ + 'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3639', + 'only_matching': True, + }, { + 'url': 'https://learning.scte.org/course/view.php?id=3073', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + title = self._search_regex( + r'

(.+?)

', webpage, 'title', default=None) + + entries = [] + for mobj in re.finditer( + r'''(?x) + ]+ + href=(["\']) + (?P + https?://learning\.scte\.org/mod/ + (?Pscorm|subcourse)/view\.php?(?:(?!\1).)*? + \bid=\d+ + ) + ''', + webpage): + item_url = mobj.group('url') + if item_url == url: + continue + ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm' + else SCTECourseIE.ie_key()) + entries.append(self.url_result(item_url, ie=ie)) + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dlc/extractor/seeker.py b/youtube_dlc/extractor/seeker.py new file mode 100644 index 0000000..7872dc8 --- /dev/null +++ b/youtube_dlc/extractor/seeker.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + get_element_by_class, + strip_or_none, +) + + +class SeekerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P.*)-(?P\d+)\.html' + _TESTS = [{ + 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', + 'md5': '897d44bbe0d8986a2ead96de565a92db', + 'info_dict': { + 'id': 'Elrn3gnY', + 'ext': 'mp4', + 'title': 'Should Trump Be Required To Release His Tax Returns?', + 'description': 'md5:41efa8cfa8d627841045eec7b018eb45', + 'timestamp': 1490090165, + 'upload_date': '20170321', + } + }, { + 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', + 'playlist': [ + { + 'md5': '0497b9f20495174be73ae136949707d2', + 'info_dict': { + 'id': 'FihYQ8AE', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c', + 'timestamp': 1490039133, + 'upload_date': '20170320', + }, + } + ], + 'info_dict': { + 'id': '1834116536', + 'title': 'After Gorilla Killing, Changes Ahead for Zoos', + 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting 
deaths of a gorilla and two lions.', + }, + }] + + def _real_extract(self, url): + display_id, article_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + entries = [] + for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage): + entries.append(self.url_result( + 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id)) + return self.playlist_result( + entries, article_id, + self._og_search_title(webpage), + strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage)) diff --git a/youtube_dlc/extractor/senateisvp.py b/youtube_dlc/extractor/senateisvp.py new file mode 100644 index 0000000..db5ef8b --- /dev/null +++ b/youtube_dlc/extractor/senateisvp.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unsmuggle_url, +) +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class SenateISVPIE(InfoExtractor): + _COMM_MAP = [ + ['ag', '76440', 'http://ag-f.akamaihd.net'], + ['aging', '76442', 'http://aging-f.akamaihd.net'], + ['approps', '76441', 'http://approps-f.akamaihd.net'], + ['armed', '76445', 'http://armed-f.akamaihd.net'], + ['banking', '76446', 'http://banking-f.akamaihd.net'], + ['budget', '76447', 'http://budget-f.akamaihd.net'], + ['cecc', '76486', 'http://srs-f.akamaihd.net'], + ['commerce', '80177', 'http://commerce1-f.akamaihd.net'], + ['csce', '75229', 'http://srs-f.akamaihd.net'], + ['dpc', '76590', 'http://dpc-f.akamaihd.net'], + ['energy', '76448', 'http://energy-f.akamaihd.net'], + ['epw', '76478', 'http://epw-f.akamaihd.net'], + ['ethics', '76449', 'http://ethics-f.akamaihd.net'], + ['finance', '76450', 'http://finance-f.akamaihd.net'], + ['foreign', '76451', 'http://foreign-f.akamaihd.net'], + ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'], + ['help', '76452', 'http://help-f.akamaihd.net'], + ['indian', '76455', 
'http://indian-f.akamaihd.net'], + ['intel', '76456', 'http://intel-f.akamaihd.net'], + ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'], + ['jccic', '85180', 'http://jccic-f.akamaihd.net'], + ['jec', '76458', 'http://jec-f.akamaihd.net'], + ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'], + ['rpc', '76591', 'http://rpc-f.akamaihd.net'], + ['rules', '76460', 'http://rules-f.akamaihd.net'], + ['saa', '76489', 'http://srs-f.akamaihd.net'], + ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'], + ['srs', '75229', 'http://srs-f.akamaihd.net'], + ['uscc', '76487', 'http://srs-f.akamaihd.net'], + ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'], + ['arch', '', 'http://ussenate-f.akamaihd.net/'] + ] + _IE_NAME = 'senate.gov' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P.+)' + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', + 'info_dict': { + 'id': 'commerce011514', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', + # checksum differs each time + 'info_dict': { + 'id': 'intel090613', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, + }] + + @staticmethod + def 
_search_iframe_url(webpage): + mobj = re.search( + r"]+src=['\"](?Phttps?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + + def _get_info_for_comm(self, committee): + for entry in self._COMM_MAP: + if entry[0] == committee: + return entry[1:] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs')) + if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = re.sub(r'.mp4$', '', qs['filename'][0]) + + webpage = self._download_webpage(url, video_id) + + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_search_regex(r'([^<]+)', webpage, video_id) + poster = qs.get('poster') + thumbnail = poster[0] if poster else None + + video_type = qs['type'][0] + committee = video_type if video_type == 'arch' else qs['comm'][0] + stream_num, domain = self._get_info_for_comm(committee) + + formats = [] + if video_type == 'arch': + filename = video_id if '.' in video_id else video_id + '.mp4' + formats = [{ + # All parameters in the query string are necessary to prevent a 403 error + 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', + }] + else: + hdcore_sign = 'hdcore=3.1.0' + url_params = (domain, video_id, stream_num) + f4m_url = '%s/z/%s_1@%s/manifest.f4m?' 
% url_params + hdcore_sign + m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params + for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): + mobj = re.search(r'(?P(?:-p|-b)).m3u8', entry['url']) + if mobj: + entry['format_id'] += mobj.group('tag') + formats.append(entry) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } diff --git a/youtube_dlc/extractor/sendtonews.py b/youtube_dlc/extractor/sendtonews.py new file mode 100644 index 0000000..9d96529 --- /dev/null +++ b/youtube_dlc/extractor/sendtonews.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_iso8601, + update_url_query, + int_or_none, + determine_protocol, + unescapeHTML, +) + + +class SendtoNewsIE(InfoExtractor): + _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' + + _TEST = { + # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ + 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', + 'info_dict': { + 'id': 'GxfCe0Zo7D-175909-5588' + }, + 'playlist_count': 8, + # test the first video only to prevent lengthy tests + 'playlist': [{ + 'info_dict': { + 'id': '240385', + 'ext': 'mp4', + 'title': 'Indians introduce Encarnacion', + 'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland', + 'duration': 137.898, + 'thumbnail': r're:https?://.*\.jpg$', + 'upload_date': '20170105', + 
'timestamp': 1483649762, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search(r'''(?x)]+src=([\'"]) + (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? + .*\bSC=(?P[0-9a-zA-Z-]+).* + \1>''', webpage) + if mobj: + sc = mobj.group('SC') + return cls._URL_TEMPLATE % sc + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + data_url = update_url_query( + url.replace('embedplayer.php', 'data_read.php'), + {'cmd': 'loadInitial'}) + playlist_data = self._download_json(data_url, playlist_id) + + entries = [] + for video in playlist_data['playlistData'][0]: + info_dict = self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True}) + + for f in info_dict['formats']: + if f.get('tbr'): + continue + tbr = int_or_none(self._search_regex( + r'/(\d+)k/', f['url'], 'bitrate', default=None)) + if not tbr: + continue + f.update({ + 'format_id': '%s-%d' % (determine_protocol(f), tbr), + 'tbr': tbr, + }) + self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id')) + + thumbnails = [] + if video.get('thumbnailUrl'): + thumbnails.append({ + 'id': 'normal', + 'url': video['thumbnailUrl'], + }) + if video.get('smThumbnailUrl'): + thumbnails.append({ + 'id': 'small', + 'url': video['smThumbnailUrl'], + }) + info_dict.update({ + 'title': video['S_headLine'].strip(), + 'description': unescapeHTML(video.get('S_fullStory')), + 'thumbnails': thumbnails, + 'duration': float_or_none(video.get('SM_length')), + 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + }) + entries.append(info_dict) + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dlc/extractor/servus.py b/youtube_dlc/extractor/servus.py new file mode 100644 index 0000000..9401bf2 --- /dev/null +++ 
b/youtube_dlc/extractor/servus.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class ServusIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| + servustv\.com/videos + ) + /(?P[aA]{2}-\w+|\d+-\d+) + ''' + _TESTS = [{ + # new URL schema + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', + 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'info_dict': { + 'id': 'AA-1T6VBU5PW1W12', + 'ext': 'mp4', + 'title': 'Die Grünen aus Sicht des Volkes', + 'description': 'md5:1247204d85783afe3682644398ff2ec4', + 'thumbnail': r're:^https?://.*\.jpg', + } + }, { + # old URL schema + 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + (r'videoLabel\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, + group='title') or self._og_search_title(webpage) + title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + formats = self._extract_m3u8_formats( + 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': 
thumbnail, + 'formats': formats, + } diff --git a/youtube_dlc/extractor/sevenplus.py b/youtube_dlc/extractor/sevenplus.py new file mode 100644 index 0000000..84568ac --- /dev/null +++ b/youtube_dlc/extractor/sevenplus.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .brightcove import BrightcoveNewIE +from ..compat import compat_str +from ..utils import ( + try_get, + update_url_query, +) + + +class SevenPlusIE(BrightcoveNewIE): + IE_NAME = '7plus' + _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))' + _TESTS = [{ + 'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003', + 'info_dict': { + 'id': 'MTYS7-003', + 'ext': 'mp4', + 'title': 'S7 E3 - Wind Surf', + 'description': 'md5:29c6a69f21accda7601278f81b46483d', + 'uploader_id': '5303576322001', + 'upload_date': '20171201', + 'timestamp': 1512106377, + 'series': 'Mighty Ships', + 'season_number': 7, + 'episode_number': 3, + 'episode': 'Wind Surf', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + } + }, { + 'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, episode_id = re.match(self._VALID_URL, url).groups() + + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + + for source in media.get('sources', {}): + src = source.get('src') + if not src: + continue + source['src'] = update_url_query(src, {'rule': ''}) + + info = self._parse_brightcove_metadata(media, episode_id) + + content = self._download_json( + 'https://component-cdn.swm.digital/content/' + path, + episode_id, headers={ + 'market-id': 4, + }, fatal=False) or {} + for item in content.get('items', {}): + if 
item.get('componentData', {}).get('componentType') == 'infoPanel': + for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]: + value = item.get(src_key) + if value: + info[dst_key] = value + info['series'] = try_get( + item, lambda x: x['seriesLogo']['name'], compat_str) + mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title']) + if mobj: + info.update({ + 'season_number': int(mobj.group(1)), + 'episode_number': int(mobj.group(2)), + 'episode': mobj.group(3), + }) + + return info diff --git a/youtube_dlc/extractor/sexu.py b/youtube_dlc/extractor/sexu.py new file mode 100644 index 0000000..3df5152 --- /dev/null +++ b/youtube_dlc/extractor/sexu.py @@ -0,0 +1,63 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SexuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P<id>\d+)' + _TEST = { + 'url': 'http://sexu.com/961791/', + 'md5': 'ff615aca9691053c94f8f10d96cd7884', + 'info_dict': { + 'id': '961791', + 'ext': 'mp4', + 'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b', + 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403', + 'categories': list, # NSFW + 'thumbnail': r're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + jwvideo = self._parse_json( + self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'), + video_id) + + sources = jwvideo['sources'] + + formats = [{ + 'url': source['file'].replace('\\', ''), + 'format_id': source.get('label'), + 'height': int(self._search_regex( + r'^(\d+)[pP]', source.get('label', ''), 'height', + default=None)), + } for source in sources if source.get('file')] + self._sort_formats(formats) + + title = self._html_search_regex( + r'<title>([^<]+)\s*-\s*Sexu\.Com', webpage, 'title') + + description = self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = jwvideo.get('image') + + 
categories_str = self._html_search_meta( + 'keywords', webpage, 'categories') + categories = ( + None if categories_str is None + else categories_str.split(',')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'formats': formats, + 'age_limit': 18, + } diff --git a/youtube_dlc/extractor/seznamzpravy.py b/youtube_dlc/extractor/seznamzpravy.py new file mode 100644 index 0000000..7a1c7e3 --- /dev/null +++ b/youtube_dlc/extractor/seznamzpravy.py @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) +from ..utils import ( + urljoin, + int_or_none, + parse_codecs, + try_get, +) + + +def _raw_id(src_url): + return compat_urllib_parse_urlparse(src_url).path.split('/')[-1] + + +class SeznamZpravyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' + _TESTS = [{ + 'url': 
'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy', + 'info_dict': { + 'id': '170889', + 'ext': 'mp4', + 'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'duration': 241, + 'series': 'Svět bez obalu', + }, + 'params': { + 'skip_download': True, + }, + }, { + # with Location key + 'url': 
'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5§ionPrefixPostroll=%2Fzpravy%2Fvyzva®ression=false', + 'info_dict': { + 'id': '185688', + 'ext': 'mp4', + 'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'series': 'Výzva', + }, + 'params': { + 'skip_download': True, + }, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', + webpage)] + + def _extract_sdn_formats(self, sdn_url, video_id): + sdn_data = self._download_json(sdn_url, video_id) + + if sdn_data.get('Location'): + sdn_url = sdn_data['Location'] + sdn_data = self._download_json(sdn_url, video_id) + + 
formats = [] + mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {} + for format_id, format_data in mp4_formats.items(): + relative_url = format_data.get('url') + if not relative_url: + continue + + try: + width, height = format_data.get('resolution') + except (TypeError, ValueError): + width, height = None, None + + f = { + 'url': urljoin(sdn_url, relative_url), + 'format_id': 'http-%s' % format_id, + 'tbr': int_or_none(format_data.get('bandwidth'), scale=1000), + 'width': int_or_none(width), + 'height': int_or_none(height), + } + f.update(parse_codecs(format_data.get('codec'))) + formats.append(f) + + pls = sdn_data.get('pls', {}) + + def get_url(format_id): + return try_get(pls, lambda x: x[format_id]['url'], compat_str) + + dash_rel_url = get_url('dash') + if dash_rel_url: + formats.extend(self._extract_mpd_formats( + urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash', + fatal=False)) + + hls_rel_url = get_url('hls') + if hls_rel_url: + formats.extend(self._extract_m3u8_formats( + urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', + m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + + src = params['src'][0] + title = params['title'][0] + video_id = params.get('contentId', [_raw_id(src)])[0] + formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id) + + duration = int_or_none(params.get('duration', [None])[0]) + series = params.get('series', [None])[0] + thumbnail = params.get('poster', [None])[0] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'series': series, + 'formats': formats, + } + + +class SeznamZpravyArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P\d+)' + _API_URL = 'https://apizpravy.seznam.cz/' + + _TESTS = [{ + # two videos on one page, with SDN URL + 'url': 
'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990', + 'info_dict': { + 'id': '35990', + 'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2', + 'description': 'md5:933f7b06fa337a814ba199d3596d27ba', + }, + 'playlist_count': 2, + }, { + # video with live stream URL + 'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489', + 'info_dict': { + 'id': '38489', + 'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60', + 'description': 'md5:428e7926a1a81986ec7eb23078004fb4', + }, + 'playlist_count': 1, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + webpage = self._download_webpage(url, article_id) + + info = self._search_json_ld(webpage, article_id, default={}) + + title = info.get('title') or self._og_search_title(webpage, fatal=False) + description = info.get('description') or self._og_search_description(webpage) + + return self.playlist_result([ + self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) + for entry_url in SeznamZpravyIE._extract_urls(webpage)], + article_id, title, description) diff --git a/youtube_dlc/extractor/shahid.py b/youtube_dlc/extractor/shahid.py new file mode 100644 index 0000000..5c2a620 --- /dev/null +++ b/youtube_dlc/extractor/shahid.py @@ -0,0 +1,215 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import math +import re + +from .aws import AWSIE +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + InAdvancePagedList, + int_or_none, + parse_iso8601, + str_or_none, + urlencode_postdata, +) + + +class ShahidBaseIE(AWSIE): + _AWS_PROXY_HOST = 'api2.shahid.net' + _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh' + + def _handle_error(self, e): + fail_data = self._parse_json( + e.cause.read().decode('utf-8'), None, fatal=False) + if fail_data: + faults = fail_data.get('faults', []) + faults_message 
= ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) + if faults_message: + raise ExtractorError(faults_message, expected=True) + + def _call_api(self, path, video_id, request=None): + query = {} + if request: + query['request'] = json.dumps(request) + try: + return self._aws_execute_api({ + 'uri': '/proxy/v2/' + path, + 'access_key': 'AKIAI6X4TYCIXM2B7MUQ', + 'secret_key': '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', + }, video_id, query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + +class ShahidIE(ShahidBaseIE): + _NETRC_MACHINE = 'shahid' + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', + 'info_dict': { + 'id': '275286', + 'ext': 'mp4', + 'title': 'مجلس الشباب الموسم 1 كليب 1', + 'timestamp': 1506988800, + 'upload_date': '20171003', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746', + 'only_matching': True + }, { + # shahid plus subscriber only + 'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511', + 'only_matching': True + }] + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + try: + user_data = self._download_json( + 'https://shahid.mbc.net/wd/service/users/login', + None, 'Logging in', data=json.dumps({ + 'email': email, + 'password': password, + 'basic': 'false', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/json; charset=UTF-8', + })['user'] + except ExtractorError as e: + if 
isinstance(e.cause, compat_HTTPError): + self._handle_error(e) + raise + + self._download_webpage( + 'https://shahid.mbc.net/populateContext', + None, 'Populate Context', data=urlencode_postdata({ + 'firstName': user_data['firstName'], + 'lastName': user_data['lastName'], + 'userName': user_data['email'], + 'csg_user_name': user_data['email'], + 'subscriberId': user_data['id'], + 'sessionId': user_data['sessionId'], + })) + + def _real_extract(self, url): + page_type, video_id = re.match(self._VALID_URL, url).groups() + if page_type == 'clip': + page_type = 'episode' + + playout = self._call_api( + 'playout/url/' + video_id, video_id)['playout'] + + if playout.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + + formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + self._sort_formats(formats) + + # video = self._call_api( + # 'product/id', video_id, { + # 'id': video_id, + # 'productType': 'ASSET', + # 'productSubType': page_type.upper() + # })['productModel'] + + response = self._download_json( + 'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id), + video_id, 'Downloading video JSON', query={ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }) + data = response.get('data', {}) + error = data.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + + video = data[page_type] + title = video['title'] + categories = [ + category['name'] + for category in video.get('genres', []) if 'name' in category] + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnailUrl'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('referenceDate')), + 'categories': categories, + 'series': video.get('showTitle') or video.get('showName'), + 'season': video.get('seasonTitle'), + 'season_number': 
int_or_none(video.get('seasonNumber')), + 'season_id': str_or_none(video.get('seasonId')), + 'episode_number': int_or_none(video.get('number')), + 'episode_id': video_id, + 'formats': formats, + } + + +class ShahidShowIE(ShahidBaseIE): + _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P\d+)' + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187', + 'info_dict': { + 'id': '79187', + 'title': 'رامز قرش البحر', + 'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff', + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861', + 'only_matching': True + }] + _PAGE_SIZE = 30 + + def _real_extract(self, url): + show_id = self._match_id(url) + + product = self._call_api( + 'playableAsset', show_id, {'showId': show_id})['productModel'] + playlist = product['playlist'] + playlist_id = playlist['id'] + show = product.get('show', {}) + + def page_func(page_num): + playlist = self._call_api( + 'product/playlist', show_id, { + 'playListId': playlist_id, + 'pageNumber': page_num, + 'pageSize': 30, + 'sorts': [{ + 'order': 'DESC', + 'type': 'SORTDATE' + }], + }) + for product in playlist.get('productList', {}).get('products', []): + product_url = product.get('productUrl', []).get('url') + if not product_url: + continue + yield self.url_result( + product_url, 'Shahid', + str_or_none(product.get('id')), + product.get('title')) + + entries = InAdvancePagedList( + page_func, + math.ceil(playlist['count'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, show_id, show.get('title'), show.get('description')) diff --git a/youtube_dlc/extractor/shared.py b/youtube_dlc/extractor/shared.py new file mode 100644 index 0000000..02295d1 --- /dev/null +++ b/youtube_dlc/extractor/shared.py @@ -0,0 +1,138 @@ +from __future__ import unicode_literals + +from .common import 
InfoExtractor +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote_plus, +) +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + js_to_json, + KNOWN_EXTENSIONS, + parse_filesize, + rot47, + url_or_none, + urlencode_postdata, +) + + +class SharedBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + if self._FILE_NOT_FOUND in webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + video_url = self._extract_video_url(webpage, video_id, url) + + title = self._extract_title(webpage) + filesize = int_or_none(self._extract_filesize(webpage)) + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'filesize': filesize, + 'title': title, + } + + def _extract_title(self, webpage): + return compat_b64decode(self._html_search_meta( + 'full:title', webpage, 'title')).decode('utf-8') + + def _extract_filesize(self, webpage): + return self._html_search_meta( + 'full:size', webpage, 'file size', fatal=False) + + +class SharedIE(SharedBaseIE): + IE_DESC = 'shared.sx' + _VALID_URL = r'https?://shared\.sx/(?P[\da-z]{10})' + _FILE_NOT_FOUND = '>File does not exist<' + + _TEST = { + 'url': 'http://shared.sx/0060718775', + 'md5': '106fefed92a8a2adb8c98e6a0652f49b', + 'info_dict': { + 'id': '0060718775', + 'ext': 'mp4', + 'title': 'Bmp4', + 'filesize': 1720110, + }, + } + + def _extract_video_url(self, webpage, video_id, url): + download_form = self._hidden_inputs(webpage) + + video_page = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) + + video_url = self._html_search_regex( + r'data-url=(["\'])(?P(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') + + return video_url + + +class VivoIE(SharedBaseIE): + IE_DESC = 'vivo.sx' + _VALID_URL = 
r'https?://vivo\.sx/(?P[\da-z]{10})' + _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed' + + _TEST = { + 'url': 'http://vivo.sx/d7ddda0e78', + 'md5': '15b3af41be0b4fe01f4df075c2678b2c', + 'info_dict': { + 'id': 'd7ddda0e78', + 'ext': 'mp4', + 'title': 'Chicken', + 'filesize': 515659, + }, + } + + def _extract_title(self, webpage): + title = self._html_search_regex( + r'data-name\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'title', default=None, group='title') + if title: + ext = determine_ext(title) + if ext.lower() in KNOWN_EXTENSIONS: + title = title.rpartition('.' + ext)[0] + return title + return self._og_search_title(webpage) + + def _extract_filesize(self, webpage): + return parse_filesize(self._search_regex( + r'data-type=["\']video["\'][^>]*>Watch.*?<strong>\s*\((.+?)\)', + webpage, 'filesize', fatal=False)) + + def _extract_video_url(self, webpage, video_id, url): + def decode_url_old(encoded_url): + return compat_b64decode(encoded_url).decode('utf-8') + + stream_url = self._search_regex( + r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'stream url', default=None, group='url') + if stream_url: + stream_url = url_or_none(decode_url_old(stream_url)) + if stream_url: + return stream_url + + def decode_url(encoded_url): + return rot47(compat_urllib_parse_unquote_plus(encoded_url)) + + return decode_url(self._parse_json( + self._search_regex( + r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage, + 'stream'), + video_id, transform_source=js_to_json)['source']) diff --git a/youtube_dlc/extractor/showroomlive.py b/youtube_dlc/extractor/showroomlive.py new file mode 100644 index 0000000..efd9d56 --- /dev/null +++ b/youtube_dlc/extractor/showroomlive.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urljoin, +) + + +class ShowRoomLiveIE(InfoExtractor): + 
_VALID_URL = r'https?://(?:www\.)?showroom-live\.com/(?!onlive|timetable|event|campaign|news|ranking|room)(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.showroom-live.com/48_Nana_Okada', + 'only_matching': True, + } + + def _real_extract(self, url): + broadcaster_id = self._match_id(url) + + webpage = self._download_webpage(url, broadcaster_id) + + room_id = self._search_regex( + (r'SrGlobal\.roomId\s*=\s*(\d+)', + r'(?:profile|room)\?room_id\=(\d+)'), webpage, 'room_id') + + room = self._download_json( + urljoin(url, '/api/room/profile?room_id=%s' % room_id), + broadcaster_id) + + is_live = room.get('is_onlive') + if is_live is not True: + raise ExtractorError('%s is offline' % broadcaster_id, expected=True) + + uploader = room.get('performer_name') or broadcaster_id + title = room.get('room_name') or room.get('main_name') or uploader + + streaming_url_list = self._download_json( + urljoin(url, '/api/live/streaming_url?room_id=%s' % room_id), + broadcaster_id)['streaming_url_list'] + + formats = [] + for stream in streaming_url_list: + stream_url = stream.get('url') + if not stream_url: + continue + stream_type = stream.get('type') + if stream_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + stream_url, broadcaster_id, ext='mp4', m3u8_id='hls', + live=True) + for f in m3u8_formats: + f['quality'] = int_or_none(stream.get('quality', 100)) + formats.extend(m3u8_formats) + elif stream_type == 'rtmp': + stream_name = stream.get('stream_name') + if not stream_name: + continue + formats.append({ + 'url': stream_url, + 'play_path': stream_name, + 'page_url': url, + 'player_url': 'https://www.showroom-live.com/assets/swf/v3/ShowRoomLive.swf', + 'rtmp_live': True, + 'ext': 'flv', + 'format_id': 'rtmp', + 'format_note': stream.get('label'), + 'quality': int_or_none(stream.get('quality', 100)), + }) + self._sort_formats(formats) + + return { + 'id': compat_str(room.get('live_id') or broadcaster_id), + 'title': self._live_title(title), + 'description': 
room.get('description'), + 'timestamp': int_or_none(room.get('current_live_started_at')), + 'uploader': uploader, + 'uploader_id': broadcaster_id, + 'view_count': int_or_none(room.get('view_num')), + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dlc/extractor/sina.py b/youtube_dlc/extractor/sina.py new file mode 100644 index 0000000..07b766b --- /dev/null +++ b/youtube_dlc/extractor/sina.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, + update_url_query, + qualities, + get_element_by_attribute, + clean_html, +) + + +class SinaIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + (?: + (?:view/|.*\#)(?P<video_id>\d+)| + .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| + # This is used by external sites like Weibo + api/sinawebApi/outplay.php/(?P<token>.+?)\.swf + ) + ''' + + _TESTS = [ + { + 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', + 'md5': 'd38433e2fc886007729735650ae4b3e9', + 'info_dict': { + 'id': '250576622', + 'ext': 'mp4', + 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', + } + }, + { + 'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html', + 'info_dict': { + 'id': '101314253', + 'ext': 'flv', + 'title': '军方提高对朝情报监视级别', + }, + 'skip': 'the page does not exist or has been deleted', + }, + { + 'url': 'http://video.sina.com.cn/view/250587748.html', + 'md5': '3d1807a25c775092aab3bc157fff49b4', + 'info_dict': { + 'id': '250587748', + 'ext': 'mp4', + 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('video_id') + if not video_id: + if mobj.group('token') is not None: + # The video id is in the redirected url + self.to_screen('Getting video id') + request = HEADRequest(url) + _, urlh = self._download_webpage_handle(request, 'NA', False) + 
return self._real_extract(urlh.geturl())
+            else:
+                pseudo_id = mobj.group('pseudo_id')
+                webpage = self._download_webpage(url, pseudo_id)
+                error = get_element_by_attribute('class', 'errtitle', webpage)
+                if error:
+                    raise ExtractorError('%s said: %s' % (
+                        self.IE_NAME, clean_html(error)), expected=True)
+                video_id = self._search_regex(
+                    r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
+
+        video_data = self._download_json(
+            'http://s.video.sina.com.cn/video/h5play',
+            video_id, query={'video_id': video_id})
+        if video_data['code'] != 1:
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, video_data['message']), expected=True)
+        else:
+            video_data = video_data['data']
+            title = video_data['title']
+            description = video_data.get('description')
+            if description:
+                description = description.strip()
+
+            preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
+            formats = []
+            for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
+                file_api = quality.get('file_api')
+                file_id = quality.get('file_id')
+                if not file_api or not file_id:
+                    continue
+                formats.append({
+                    'format_id': quality_id,
+                    'url': update_url_query(file_api, {'vid': file_id}),
+                    'preference': preference(quality_id),
+                    'ext': 'mp4',
+                })
+            self._sort_formats(formats)
+
+            return {
+                'id': video_id,
+                'title': title,
+                'description': description,
+                'thumbnail': video_data.get('image'),
+                'duration': int_or_none(video_data.get('length')),
+                'timestamp': int_or_none(video_data.get('create_time')),
+                'formats': formats,
+            }
diff --git a/youtube_dlc/extractor/sixplay.py b/youtube_dlc/extractor/sixplay.py
new file mode 100644
index 0000000..7ec66ec
--- /dev/null
+++ b/youtube_dlc/extractor/sixplay.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    try_get,
+    qualities,
+)
+
+
+class SixPlayIE(InfoExtractor):
+    IE_NAME = '6play'
+    _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr|rtlmost\.hu)/.+?-c_)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051',
+        'md5': '31fcd112637baa0c2ab92c4fcd8baf27',
+        'info_dict': {
+            'id': '12041051',
+            'ext': 'mp4',
+            'title': 'Le but qui a marqué l\'histoire du football français !',
+            'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851',
+        },
+    }, {
+        'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869',
+        'only_matching': True,
+    }, {
+        'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.rtlmost.hu/megtorve-p_14167/megtorve-6-resz-c_12397787',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Query the 6play middleware for the clip and collect its formats and subtitles."""
+        domain, video_id = re.search(self._VALID_URL, url).groups()
+        service, consumer_name = {
+            '6play.fr': ('6play', 'm6web'),
+            'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'),
+            'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'),
+            'rtlmost.hu': ('rtlhu_rtl_most', 'rtlhu'),
+        }.get(domain, ('6play', 'm6web'))
+
+        data = self._download_json(
+            'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id),
+            video_id, headers={
+                'x-customer-name': consumer_name
+            }, query={
+                'csa': 5,
+                'with': 'clips',
+            })
+
+        clip_data = data['clips'][0]
+        title = clip_data['title']
+
+        urls = []
+        quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
+        formats = []
+        subtitles = {}
+        assets = clip_data.get('assets') or []
+        for asset in assets:
+            asset_url = asset.get('full_physical_path')
+            protocol = asset.get('protocol')
+            # Skip assets without a URL, URLs already processed, and
+            # primetime / usp_hlsfp_h264 assets whose URL carries neither the
+            # '_drmnp.ism/' nor the '_unpnp.ism/' marker.
+            if not asset_url or ((protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264') and not ('_drmnp.ism/' in asset_url or '_unpnp.ism/' in asset_url)) or asset_url in urls:
+                continue
+            urls.append(asset_url)
+            container = asset.get('video_container')
+            ext = determine_ext(asset_url)
+            if protocol == 'http_subtitle' or ext == 'vtt':
+                subtitles.setdefault('fr', []).append({'url': asset_url})
+                continue
+            if container == 'm3u8' or ext == 'm3u8':
+                if protocol == 'usp':
+                    if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
+                        urlh = self._request_webpage(
+                            asset_url, video_id, fatal=False,
+                            headers=self.geo_verification_headers())
+                        if not urlh:
+                            continue
+                        asset_url = urlh.geturl()
+                    base_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/')
+                    # Build every candidate from the same base URL. Rewriting
+                    # asset_url in place turned '_sd1/' into '_sd3/' on the
+                    # first pass, so the later passes had nothing left to
+                    # replace and never tried '_sd2/' or '_sd1/'.
+                    for i in range(3, 0, -1):
+                        asset_url = base_url.replace('_sd1/', '_sd%d/' % i)
+                        m3u8_formats = self._extract_m3u8_formats(
+                            asset_url, video_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False)
+                        formats.extend(m3u8_formats)
+                        formats.extend(self._extract_mpd_formats(
+                            asset_url.replace('.m3u8', '.mpd'),
+                            video_id, mpd_id='dash', fatal=False))
+                        if m3u8_formats:
+                            break
+                else:
+                    formats.extend(self._extract_m3u8_formats(
+                        asset_url, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+            elif container == 'mp4' or ext == 'mp4':
+                quality = asset.get('video_quality')
+                formats.append({
+                    'url': asset_url,
+                    'format_id': quality,
+                    'quality': quality_key(quality),
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        def get(getter):
+            """Return the first non-empty string that getter yields from data, then clip_data."""
+            for src in (data, clip_data):
+                v = try_get(src, getter, compat_str)
+                if v:
+                    return v
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': get(lambda x: x['description']),
+            'duration': int_or_none(clip_data.get('duration')),
+            'series': get(lambda x: x['program']['title']),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dlc/extractor/sky.py b/youtube_dlc/extractor/sky.py
new file mode 100644
index 0000000..ea30d6e
--- /dev/null
+++ b/youtube_dlc/extractor/sky.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+ +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + smuggle_url, + strip_or_none, + urljoin, +) + + +class SkyBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'(<div.+?class="[^"]*sdc-article-video__media-ooyala[^"]*"[^>]+>)', + webpage, 'video data')) + + video_url = 'ooyala:%s' % video_data['data-video-id'] + if video_data.get('data-token-required') == 'true': + token_fetch_options = self._parse_json(video_data.get( + 'data-token-fetch-options', '{}'), video_id, fatal=False) or {} + token_fetch_url = token_fetch_options.get('url') + if token_fetch_url: + embed_token = self._download_webpage(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token.strip('"')}) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': strip_or_none(self._og_search_description(webpage)), + 'ie_key': 'Ooyala', + } + + +class SkySportsIE(SkyBaseIE): + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', + 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', + 'info_dict': { + 'id': 'o3eWJnNDE6l7kfNO8BOoBlRxXRQ4ANNQ', + 'ext': 'mp4', + 'title': 'Bale: It\'s our time to shine', + 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', + }, + 'add_ie': ['Ooyala'], + } + + +class SkyNewsIE(SkyBaseIE): + _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', + 'md5': 'd6327e581473cea9976a3236ded370cd', + 'info_dict': { + 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'ext': 'mp4', + 'title': 'Russian plane inspected after 
deadly fire', + 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + }, + 'add_ie': ['Ooyala'], + } diff --git a/youtube_dlc/extractor/skylinewebcams.py b/youtube_dlc/extractor/skylinewebcams.py new file mode 100644 index 0000000..b7f8ac7 --- /dev/null +++ b/youtube_dlc/extractor/skylinewebcams.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SkylineWebcamsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?skylinewebcams\.com/[^/]+/webcam/(?:[^/]+/)+(?P<id>[^/]+)\.html' + _TEST = { + 'url': 'https://www.skylinewebcams.com/it/webcam/italia/lazio/roma/scalinata-piazza-di-spagna-barcaccia.html', + 'info_dict': { + 'id': 'scalinata-piazza-di-spagna-barcaccia', + 'ext': 'mp4', + 'title': 're:^Live Webcam Scalinata di Piazza di Spagna - La Barcaccia [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'Roma, veduta sulla Scalinata di Piazza di Spagna e sulla Barcaccia', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + stream_url = self._search_regex( + r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, + 'stream url', group='url') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + return { + 'id': video_id, + 'url': stream_url, + 'ext': 'mp4', + 'title': self._live_title(title), + 'description': description, + 'is_live': True, + } diff --git a/youtube_dlc/extractor/skynewsarabia.py b/youtube_dlc/extractor/skynewsarabia.py new file mode 100644 index 0000000..fffc9aa --- /dev/null +++ b/youtube_dlc/extractor/skynewsarabia.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from 
..utils import ( + parse_iso8601, + parse_duration, +) + + +class SkyNewsArabiaBaseIE(InfoExtractor): + _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images' + + def _call_api(self, path, value): + return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value) + + def _get_limelight_media_id(self, url): + return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id') + + def _get_image_url(self, image_path_template, width='1600', height='1200'): + return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height) + + def _extract_video_info(self, video_data): + video_id = compat_str(video_data['id']) + topic = video_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(video_data['videoUrl'][0]['url']), + 'id': video_id, + 'title': video_data['headline'], + 'description': video_data.get('summary'), + 'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('runTime')), + 'tags': video_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + +class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3', + 'info_dict': { + 'id': '794902', + 'ext': 'flv', + 'title': 'نصف مليون مصباح على شجرة كريسماس', + 'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6', + 'upload_date': '20151128', + 'timestamp': 1448697198, + 'duration': 2119, + }, + 'params': { + # rtmp download + 
'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._call_api('video', video_id) + return self._extract_video_info(video_data) + + +class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:article' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', + 'info_dict': { + 'id': '794549', + 'ext': 'flv', + 'title': 'بالفيديو.. ألعاب ذكية تحاكي واقع المنطقة', + 'description': 'md5:0c373d29919a851e080ee4edd0c5d97f', + 'upload_date': '20151126', + 'timestamp': 1448559336, + 'duration': 281.6, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD', + 'info_dict': { + 'id': '794844', + 'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن', + 'description': 'md5:5c927b8b2e805796e7f693538d96fc7e', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article_data = self._call_api('article', article_id) + media_asset = article_data['mediaAsset'] + if media_asset['type'] == 'VIDEO': + topic = article_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']), + 'id': article_id, + 'title': article_data['headline'], + 'description': article_data.get('summary'), + 'thumbnail': 
self._get_image_url(media_asset['imageUrl']), + 'timestamp': parse_iso8601(article_data.get('date')), + 'tags': article_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': url, + 'ie_key': 'LimelightMedia', + } + entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO'] + return self.playlist_result(entries, article_id, article_data['headline'], article_data.get('summary')) diff --git a/youtube_dlc/extractor/slideshare.py b/youtube_dlc/extractor/slideshare.py new file mode 100644 index 0000000..e89ebeb --- /dev/null +++ b/youtube_dlc/extractor/slideshare.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, +) +from ..utils import ( + ExtractorError, + get_element_by_id, +) + + +class SlideshareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' + + _TEST = { + 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', + 'info_dict': { + 'id': '25665706', + 'ext': 'mp4', + 'title': 'Managing Scale and Complexity', + 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). 
This presentation was given by Adrian Cockcroft from Netflix.',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        page_title = mobj.group('title')
+        webpage = self._download_webpage(url, page_title)
+        slideshare_obj = self._search_regex(
+            r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
+            webpage, 'slideshare object')
+        info = json.loads(slideshare_obj)
+        if info['slideshow']['type'] != 'video':
+            raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
+
+        doc = info['doc']
+        bucket = info['jsplayer']['video_bucket']
+        ext = info['jsplayer']['video_extension']
+        video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
+        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
+            r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
+            'description', fatal=False)
+
+        return {
+            '_type': 'video',
+            'id': info['slideshow']['id'],
+            'title': info['slideshow']['title'],
+            'ext': ext,
+            'url': video_url,
+            'thumbnail': info['slideshow']['pin_image_url'],
+            'description': description.strip() if description else None,
+        }
diff --git a/youtube_dlc/extractor/slideslive.py b/youtube_dlc/extractor/slideslive.py
new file mode 100644
index 0000000..d9ea768
--- /dev/null
+++ b/youtube_dlc/extractor/slideslive.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    smuggle_url,
+)
+
+
+class SlidesLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # video_service_name = YOUTUBE
+        'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
+        'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f',
+        'info_dict': {
+            'id': 'LMtgR8ba0b0',
+            'ext': 'mp4',
+            'title': 'GCC IA16 backend',
+            'description': 'Watch full version of this video at https://slideslive.com/38902413.',
+            'uploader': 'SlidesLive Videos - A',
+            'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
+            'upload_date': '20170925',
+        }
+    }, {
+        # video_service_name = youtube
+        'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
+        'only_matching': True,
+    }, {
+        # video_service_name = url
+        'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
+        'only_matching': True,
+    }, {
+        # video_service_name = vimeo
+        'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Look up the player JSON and hand the video off to the service that hosts it."""
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://ben.slideslive.com/player/' + video_id, video_id)
+        service_name = video_data['video_service_name'].lower()
+        # Raise a real extractor error instead of using ``assert``: asserts
+        # are removed when Python runs with -O, which would let an unknown
+        # service fall through to the url_transparent branch below.
+        if service_name not in ('url', 'vimeo', 'youtube'):
+            raise ExtractorError(
+                'Unsupported video service: %s' % service_name)
+        service_id = video_data['video_service_id']
+        info = {
+            'id': video_id,
+            'thumbnail': video_data.get('thumbnail'),
+            'url': service_id,
+        }
+        if service_name == 'url':
+            info['title'] = video_data['title']
+        else:
+            info.update({
+                '_type': 'url_transparent',
+                'ie_key': service_name.capitalize(),
+                'title': video_data.get('title'),
+            })
+            if service_name == 'vimeo':
+                info['url'] = smuggle_url(
+                    'https://player.vimeo.com/video/' + service_id,
+                    {'http_headers': {'Referer': url}})
+        return info
diff --git a/youtube_dlc/extractor/slutload.py b/youtube_dlc/extractor/slutload.py
new file mode 100644
index 0000000..661f9e5
--- /dev/null
+++ b/youtube_dlc/extractor/slutload.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SlutloadIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
+        'md5': '868309628ba00fd488cf516a113fd717',
+        'info_dict': {
+            'id': 'TD73btpBqSxc',
+            'ext': 'mp4',
+            'title': 'virginie baisee en cam',
+            
'age_limit': 18, + 'thumbnail': r're:https?://.*?\.jpg' + }, + }, { + # mobile site + 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + embed_page = self._download_webpage( + 'http://www.slutload.com/embed_player/%s' % video_id, video_id, + 'Downloading embed page', fatal=False) + + if embed_page: + def extract(what): + return self._html_search_regex( + r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what, + embed_page, 'video %s' % what, default=None, group='url') + + video_url = extract('url') + if video_url: + title = self._html_search_regex( + r'<title>([^<]+)', embed_page, 'title', default=video_id) + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': extract('preview'), + 'age_limit': 18 + } + + webpage = self._download_webpage( + 'http://www.slutload.com/video/_/%s/' % video_id, video_id) + title = self._html_search_regex( + r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ + 'id': video_id, + 'title': title, + 'age_limit': 18, + }) + return info diff --git a/youtube_dlc/extractor/smotri.py b/youtube_dlc/extractor/smotri.py new file mode 100644 index 0000000..45995f3 --- /dev/null +++ b/youtube_dlc/extractor/smotri.py @@ -0,0 +1,416 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json +import hashlib +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + sanitized_Request, + unified_strdate, + urlencode_postdata, + xpath_text, +) + + +class SmotriIE(InfoExtractor): + IE_DESC = 'Smotri.com' + IE_NAME = 'smotri' + _VALID_URL 
= r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' + _NETRC_MACHINE = 'smotri' + + _TESTS = [ + # real video id 2610366 + { + 'url': 'http://smotri.com/video/view/?id=v261036632ab', + 'md5': '02c0dfab2102984e9c5bb585cc7cc321', + 'info_dict': { + 'id': 'v261036632ab', + 'ext': 'mp4', + 'title': 'катастрофа с камер видеонаблюдения', + 'uploader': 'rbc2008', + 'uploader_id': 'rbc08', + 'upload_date': '20131118', + 'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', + }, + }, + # real video id 57591 + { + 'url': 'http://smotri.com/video/view/?id=v57591cb20', + 'md5': '830266dfc21f077eac5afd1883091bcd', + 'info_dict': { + 'id': 'v57591cb20', + 'ext': 'flv', + 'title': 'test', + 'uploader': 'Support Photofile@photofile', + 'uploader_id': 'support-photofile', + 'upload_date': '20070704', + 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', + }, + }, + # video-password, not approved by moderator + { + 'url': 'http://smotri.com/video/view/?id=v1390466a13c', + 'md5': 'f6331cef33cad65a0815ee482a54440b', + 'info_dict': { + 'id': 'v1390466a13c', + 'ext': 'mp4', + 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', + 'uploader': 'timoxa40', + 'uploader_id': 'timoxa40', + 'upload_date': '20100404', + 'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', + }, + 'params': { + 'videopassword': 'qwerty', + }, + 'skip': 'Video is not approved by moderator', + }, + # video-password + { + 'url': 'http://smotri.com/video/view/?id=v6984858774#', + 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', + 'info_dict': { + 'id': 'v6984858774', + 'ext': 'mp4', + 'title': 'Дача Солженицина ПАРОЛЬ 223322', + 'uploader': 'psavari1', + 'uploader_id': 'psavari1', + 'upload_date': '20081103', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'videopassword': '223322', + }, + }, + # age limit + video-password, not approved by moderator + { + 'url': 
'http://smotri.com/video/view/?id=v15408898bcf', + 'md5': '91e909c9f0521adf5ee86fbe073aad70', + 'info_dict': { + 'id': 'v15408898bcf', + 'ext': 'flv', + 'title': 'этот ролик не покажут по ТВ', + 'uploader': 'zzxxx', + 'uploader_id': 'ueggb', + 'upload_date': '20101001', + 'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', + 'age_limit': 18, + }, + 'params': { + 'videopassword': '333' + }, + 'skip': 'Video is not approved by moderator', + }, + # age limit + video-password + { + 'url': 'http://smotri.com/video/view/?id=v7780025814', + 'md5': 'b4599b068422559374a59300c5337d72', + 'info_dict': { + 'id': 'v7780025814', + 'ext': 'mp4', + 'title': 'Sexy Beach (пароль 123)', + 'uploader': 'вАся', + 'uploader_id': 'asya_prosto', + 'upload_date': '20081218', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'videopassword': '123' + }, + }, + # swf player + { + 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', + 'md5': '31099eeb4bc906712c5f40092045108d', + 'info_dict': { + 'id': 'v9188090500', + 'ext': 'mp4', + 'title': 'Shakira - Don\'t Bother', + 'uploader': 'HannahL', + 'uploader_id': 'lisaha95', + 'upload_date': '20090331', + 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg', + }, + }, + ] + + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', + webpage) + if mobj is not None: + return mobj.group('url') + + mobj = re.search( + r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s* + <div\s+class="video_image">[^<]+</div>\s* + <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage) + if mobj is not None: + return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') + + def _search_meta(self, name, html, display_name=None): + if display_name is None: + display_name = name + return self._html_search_meta(name, html, display_name) + + def 
_real_extract(self, url): + video_id = self._match_id(url) + + video_form = { + 'ticket': video_id, + 'video_url': '1', + 'frame_url': '1', + 'devid': 'LoadupFlashPlayer', + 'getvideoinfo': '1', + } + + video_password = self._downloader.params.get('videopassword') + if video_password: + video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() + + video = self._download_json( + 'http://smotri.com/video/view/url/bot/', + video_id, 'Downloading video JSON', + data=urlencode_postdata(video_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + video_url = video.get('_vidURL') or video.get('_vidURL_mp4') + + if not video_url: + if video.get('_moderate_no'): + raise ExtractorError( + 'Video %s has not been approved by moderator' % video_id, expected=True) + + if video.get('error'): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + if video.get('_pass_protected') == 1: + msg = ('Invalid video password' if video_password + else 'This video is protected by a password, use the --video-password option') + raise ExtractorError(msg, expected=True) + + title = video['title'] + thumbnail = video.get('_imgURL') + upload_date = unified_strdate(video.get('added')) + uploader = video.get('userNick') + uploader_id = video.get('userLogin') + duration = int_or_none(video.get('duration')) + + # Video JSON does not provide enough meta data + # We will extract some from the video web page instead + webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id + webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page') + + # Warning if video is unavailable + warning = self._html_search_regex( + r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, + 'warning message', default=None) + if warning is not None: + self._downloader.report_warning( + 'Video %s may not be available; smotri said: %s ' % + (video_id, warning)) + + # Adult content + if 'EroConfirmText">' in webpage: + 
self.report_age_confirmation() + confirm_string = self._html_search_regex( + r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, + webpage, 'confirm string') + confirm_url = webpage_url + '&confirm=%s' % confirm_string + webpage = self._download_webpage( + confirm_url, video_id, + 'Downloading video page (age confirmed)') + adult_content = True + else: + adult_content = False + + view_count = self._html_search_regex( + r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', + webpage, 'view count', fatal=False) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': int_or_none(view_count), + 'age_limit': 18 if adult_content else 0, + } + + +class SmotriCommunityIE(InfoExtractor): + IE_DESC = 'Smotri.com community videos' + IE_NAME = 'smotri:community' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' + _TEST = { + 'url': 'http://smotri.com/community/video/kommuna', + 'info_dict': { + 'id': 'kommuna', + }, + 'playlist_mincount': 4, + } + + def _real_extract(self, url): + community_id = self._match_id(url) + + rss = self._download_xml( + 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, + community_id, 'Downloading community RSS') + + entries = [ + self.url_result(video_url.text, SmotriIE.ie_key()) + for video_url in rss.findall('./channel/item/link')] + + return self.playlist_result(entries, community_id) + + +class SmotriUserIE(InfoExtractor): + IE_DESC = 'Smotri.com user videos' + IE_NAME = 'smotri:user' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' + _TESTS = [{ + 'url': 'http://smotri.com/user/inspector', + 'info_dict': { + 'id': 'inspector', + 'title': 'Inspector', + }, + 'playlist_mincount': 9, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + 
rss = self._download_xml(
+            'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id,
+            user_id, 'Downloading user RSS')
+
+        entries = [self.url_result(video_url.text, 'Smotri')
+                   for video_url in rss.findall('./channel/item/link')]
+
+        description_text = xpath_text(rss, './channel/description') or ''
+        user_nickname = self._search_regex(
+            '^Видео режиссера (.+)$', description_text,
+            'user nickname', fatal=False)
+
+        return self.playlist_result(entries, user_id, user_nickname)
+
+
+class SmotriBroadcastIE(InfoExtractor):
+    IE_DESC = 'Smotri.com broadcasts'
+    IE_NAME = 'smotri:broadcast'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*'
+    _NETRC_MACHINE = 'smotri'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        broadcast_id = mobj.group('id')
+
+        broadcast_url = 'http://' + mobj.group('url')
+        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page')
+
+        if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None:
+            raise ExtractorError(
+                'Broadcast %s does not exist' % broadcast_id, expected=True)
+
+        # Adult content
+        if re.search('EroConfirmText">', broadcast_page) is not None:
+
+            (username, password) = self._get_login_info()
+            if username is None:
+                self.raise_login_required(
+                    'Erotic broadcasts allowed only for registered users')
+
+            login_form = {
+                'login-hint53': '1',
+                'confirm_erotic': '1',
+                'login': username,
+                'password': password,
+            }
+
+            request = sanitized_Request(
+                broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form))
+            request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+            broadcast_page = self._download_webpage(
+                request, broadcast_id, 'Logging in and confirming age')
+
+            if '>Неверный логин или пароль<' in broadcast_page:
+                raise ExtractorError(
+                    'Unable to log in: bad username or password', expected=True)
+
+            adult_content = True
+        else:
+            adult_content = False
+
+        ticket = self._html_search_regex(
+            (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1',
+             r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"),
+            broadcast_page, 'broadcast ticket', group='ticket')
+
+        broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
+
+        broadcast_password = self._downloader.params.get('videopassword')
+        if broadcast_password:
+            broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
+
+        broadcast_json_page = self._download_webpage(
+            broadcast_url, broadcast_id, 'Downloading broadcast JSON')
+
+        try:  # any broadcast_json[...] lookup below may raise KeyError
+            broadcast_json = json.loads(broadcast_json_page)
+
+            protected_broadcast = broadcast_json['_pass_protected'] == 1
+            if protected_broadcast and not broadcast_password:
+                raise ExtractorError(
+                    'This broadcast is protected by a password, use the --video-password option',
+                    expected=True)
+
+            broadcast_offline = broadcast_json['is_play'] == 0
+            if broadcast_offline:
+                raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True)
+
+            rtmp_url = broadcast_json['_server']
+            mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url)
+            if not mobj:
+                raise ExtractorError('Unexpected broadcast rtmp URL')
+
+            broadcast_playpath = broadcast_json['_streamName']
+            broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL'])
+            broadcast_thumbnail = broadcast_json.get('_imgURL')
+            broadcast_title = self._live_title(broadcast_json['title'])
+            broadcast_description = broadcast_json.get('description')
+            broadcaster_nick = broadcast_json.get('nick')
+            broadcaster_login = broadcast_json.get('login')
+            rtmp_conn = 'S:%s' % uuid.uuid4().hex
+        except KeyError:
+            if broadcast_json.get('_pass_protected') == 1:  # .get(): '_pass_protected' may be the missing key, in which case protected_broadcast was never bound
+                raise ExtractorError('Bad broadcast password', expected=True)
+            raise ExtractorError('Unexpected broadcast JSON')
+
+        return {
+            'id': broadcast_id,
+            'url': rtmp_url,
+            'title': broadcast_title,
+            'thumbnail': broadcast_thumbnail,
+            'description': 
broadcast_description,
+            'uploader': broadcaster_nick,
+            'uploader_id': broadcaster_login,
+            'age_limit': 18 if adult_content else 0,
+            'ext': 'flv',
+            'play_path': broadcast_playpath,
+            'player_url': 'http://pics.smotri.com/broadcast_play.swf',
+            'app': broadcast_app,
+            'rtmp_live': True,
+            'rtmp_conn': rtmp_conn,
+            'is_live': True,
+        }
diff --git a/youtube_dlc/extractor/snotr.py b/youtube_dlc/extractor/snotr.py
new file mode 100644
index 0000000..f773547
--- /dev/null
+++ b/youtube_dlc/extractor/snotr.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_filesize,
+    str_to_int,
+)
+
+
+class SnotrIE(InfoExtractor):
+    # 'https?' makes the "s" optional; the previous 'http?' made the "p"
+    # optional instead and therefore never matched https:// URLs.
+    _VALID_URL = r'https?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)'
+    _TESTS = [{
+        'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks',
+        'info_dict': {
+            'id': '13708',
+            'ext': 'mp4',
+            'title': 'Drone flying through fireworks!',
+            'duration': 248,
+            'filesize_approx': 40700000,
+            'description': 'A drone flying through Fourth of July Fireworks',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'expected_warnings': ['description'],
+    }, {
+        'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10',
+        'info_dict': {
+            'id': '530',
+            'ext': 'mp4',
+            'title': 'David Letteman - George W. Bush Top 10',
+            'duration': 126,
+            'filesize_approx': 8500000,
+            'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Scrape title, description and stats from the page and attach them to its first HTML5 media entry."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+
+        description = self._og_search_description(webpage)
+        info_dict = self._parse_html5_media_entries(
+            url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0]
+
+        view_count = str_to_int(self._html_search_regex(
+            r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)',
+            webpage, 'view count', fatal=False))
+
+        duration = parse_duration(self._html_search_regex(
+            r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)',
+            webpage, 'duration', fatal=False))
+
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)',
+            webpage, 'filesize', fatal=False))
+
+        info_dict.update({
+            'id': video_id,
+            'description': description,
+            'title': title,
+            'view_count': view_count,
+            'duration': duration,
+            'filesize_approx': filesize_approx,
+        })
+
+        return info_dict
diff --git a/youtube_dlc/extractor/sohu.py b/youtube_dlc/extractor/sohu.py
new file mode 100644
index 0000000..76b3cc6
--- /dev/null
+++ b/youtube_dlc/extractor/sohu.py
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+)
+
+
+class SohuIE(InfoExtractor):
+    _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
+ + # Sohu videos give different MD5 sums on Travis CI and my machine + _TESTS = [{ + 'note': 'This video is available only in Mainland China', + 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', + 'info_dict': { + 'id': '382479172', + 'ext': 'mp4', + 'title': 'MV:Far East Movement《The Illest》', + }, + 'skip': 'On available in China', + }, { + 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', + 'info_dict': { + 'id': '409385080', + 'ext': 'mp4', + 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', + } + }, { + 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + } + }, { + 'note': 'Multipart video', + 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml', + 'info_dict': { + 'id': '78910339', + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + }, + 'playlist': [{ + 'info_dict': { + 'id': '78910339_part1', + 'ext': 'mp4', + 'duration': 294, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part2', + 'ext': 'mp4', + 'duration': 300, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part3', + 'ext': 'mp4', + 'duration': 150, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }] + }, { + 'note': 'Video with title containing dash', + 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml', + 'info_dict': { + 'id': '78932792', + 'ext': 'mp4', + 'title': 'youtube-dlc testing video', + }, + 'params': { + 'skip_download': True + } + }] + + def _real_extract(self, url): + + def _fetch_data(vid_id, mytv=False): + if mytv: + base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' + else: + base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + + return self._download_json( + base_data_url + vid_id, video_id, + 'Downloading JSON data for %s' % vid_id, + headers=self.geo_verification_headers()) + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + mytv = mobj.group('mytv') is not None + + 
webpage = self._download_webpage(url, video_id) + + title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) + + vid = self._html_search_regex( + r'var vid ?= ?["\'](\d+)["\']', + webpage, 'video path') + vid_data = _fetch_data(vid, mytv) + if vid_data['play'] != 1: + if vid_data.get('status') == 12: + raise ExtractorError( + '%s said: There\'s something wrong in the video.' % self.IE_NAME, + expected=True) + else: + self.raise_geo_restricted( + '%s said: The video is only licensed to users in Mainland China.' % self.IE_NAME) + + formats_json = {} + for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): + vid_id = vid_data['data'].get('%sVid' % format_id) + if not vid_id: + continue + vid_id = compat_str(vid_id) + formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) + + part_count = vid_data['data']['totalBlocks'] + + playlist = [] + for i in range(part_count): + formats = [] + for format_id, format_data in formats_json.items(): + allot = format_data['allot'] + + data = format_data['data'] + clips_url = data['clipsURL'] + su = data['su'] + + video_url = 'newflv.sohu.ccgslb.net' + cdnId = None + retries = 0 + + while 'newflv.sohu.ccgslb.net' in video_url: + params = { + 'prot': 9, + 'file': clips_url[i], + 'new': su[i], + 'prod': 'flash', + 'rb': 1, + } + + if cdnId is not None: + params['idc'] = cdnId + + download_note = 'Downloading %s video URL part %d of %d' % ( + format_id, i + 1, part_count) + + if retries > 0: + download_note += ' (retry #%d)' % retries + part_info = self._parse_json(self._download_webpage( + 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), + video_id, download_note), video_id) + + video_url = part_info['url'] + cdnId = part_info.get('nid') + + retries += 1 + if retries > 5: + raise ExtractorError('Failed to get video URL') + + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'filesize': int_or_none( + try_get(data, lambda x: x['clipsBytes'][i])), + 'width': 
int_or_none(data.get('width')), + 'height': int_or_none(data.get('height')), + 'fps': int_or_none(data.get('fps')), + }) + self._sort_formats(formats) + + playlist.append({ + 'id': '%s_part%d' % (video_id, i + 1), + 'title': title, + 'duration': vid_data['data']['clipsDuration'][i], + 'formats': formats, + }) + + if len(playlist) == 1: + info = playlist[0] + info['id'] = video_id + else: + info = { + '_type': 'multi_video', + 'entries': playlist, + 'id': video_id, + 'title': title, + } + + return info diff --git a/youtube_dlc/extractor/sonyliv.py b/youtube_dlc/extractor/sonyliv.py new file mode 100644 index 0000000..58a8c0d --- /dev/null +++ b/youtube_dlc/extractor/sonyliv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'info_dict': { + 'title': "Ep. 
1 - Achaari Cheese Toast - Bachelor's Delight", + 'id': 'ref:5024612095001', + 'ext': 'mp4', + 'upload_date': '20170923', + 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', + 'uploader_id': '5182475815001', + 'timestamp': 1506200547, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'only_matching': True, + }] + + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { + 'geo_countries': ['IN'], + 'referrer': url, + }), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dlc/extractor/soundcloud.py b/youtube_dlc/extractor/soundcloud.py new file mode 100644 index 0000000..ed70b71 --- /dev/null +++ b/youtube_dlc/extractor/soundcloud.py @@ -0,0 +1,906 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import re +import json +import random + +from .common import ( + InfoExtractor, + SearchInfoExtractor +) +from ..compat import ( + compat_HTTPError, + compat_kwargs, + compat_str, + compat_urlparse, +) +from ..utils import ( + error_to_compat_str, + ExtractorError, + float_or_none, + HEADRequest, + int_or_none, + KNOWN_EXTENSIONS, + mimetype2ext, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urlhandle_detect_ext, + sanitized_Request, +) + + +class SoundcloudEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' + _TEST = { + # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ + 'url': 
'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', + 'only_matching': True, + } + + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + + def _real_extract(self, url): + query = compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + api_url = query['url'][0] + secret_token = query.get('secret_token') + if secret_token: + api_url = update_url_query(api_url, {'secret_token': secret_token[0]}) + return self.url_result(api_url) + + +class SoundcloudIE(InfoExtractor): + """Information extractor for soundcloud.com + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from an url composed + of the stream token and uid + """ + + _VALID_URL = r'''(?x)^(?:https?://)? + (?:(?:(?:www\.|m\.)?soundcloud\.com/ + (?!stations/track) + (?P<uploader>[\w\d-]+)/ + (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?P<title>[\w\d-]+)/? + (?P<token>[^?]+?)?(?:[?].*)?$) + |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) + (?:/?\?secret_token=(?P<secret_token>[^&]+))?) + ) + ''' + IE_NAME = 'soundcloud' + _TESTS = [ + { + 'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', + 'md5': 'ebef0a451b909710ed1d7787dddbf0d7', + 'info_dict': { + 'id': '62986583', + 'ext': 'mp3', + 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1', + 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d', + 'uploader': 'E.T. 
ExTerrestrial Music', + 'uploader_id': '1571244', + 'timestamp': 1349920598, + 'upload_date': '20121011', + 'duration': 143.216, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + } + }, + # geo-restricted + { + 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', + 'info_dict': { + 'id': '47127627', + 'ext': 'mp3', + 'title': 'Goldrushed', + 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com', + 'uploader': 'The Royal Concept', + 'uploader_id': '9615865', + 'timestamp': 1337635207, + 'upload_date': '20120521', + 'duration': 227.155, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link + { + 'url': 'https://soundcloud.com/jaimemf/youtube-dlc-test-video-a-y-baw/s-8Pjrp', + 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', + 'info_dict': { + 'id': '123998367', + 'ext': 'mp3', + 'title': 'Youtube - Dl Test Video \'\' Ä↭', + 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'uploader_id': '69767071', + 'timestamp': 1386604920, + 'upload_date': '20131209', + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link (alt format) + { + 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp', + 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', + 'info_dict': { + 'id': '123998367', + 'ext': 'mp3', + 'title': 'Youtube - Dl Test Video \'\' Ä↭', + 'description': 'test chars: \"\'/\\ä↭', + 'uploader': 'jaimeMF', + 'uploader_id': '69767071', + 'timestamp': 1386604920, + 'upload_date': '20131209', + 'duration': 9.927, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # downloadable 
song + { + 'url': 'https://soundcloud.com/oddsamples/bus-brakes', + 'md5': '7624f2351f8a3b2e7cd51522496e7631', + 'info_dict': { + 'id': '128590877', + 'ext': 'mp3', + 'title': 'Bus Brakes', + 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66', + 'uploader': 'oddsamples', + 'uploader_id': '73680509', + 'timestamp': 1389232924, + 'upload_date': '20140109', + 'duration': 17.346, + 'license': 'cc-by-sa', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'uploader_id': '12563093', + 'timestamp': 1504206263, + 'upload_date': '20170831', + 'duration': 7449.096, + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. 
Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'uploader_id': '2366352', + 'timestamp': 1488152409, + 'upload_date': '20170226', + 'duration': 207.012, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', + 'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7', + 'info_dict': { + 'id': '583011102', + 'ext': 'mp3', + 'title': 'Mezzo Valzer', + 'description': 'md5:4138d582f81866a530317bae316e8b61', + 'uploader': 'Micronie', + 'uploader_id': '3352531', + 'timestamp': 1551394171, + 'upload_date': '20190228', + 'duration': 180.157, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + { + # AAC HQ format available (account with active subscription needed) + 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, + { + # Go+ (account with active subscription needed) + 'url': 'https://soundcloud.com/taylorswiftofficial/look-what-you-made-me-do', + 'only_matching': True, + }, + ] + + _API_V2_BASE = 'https://api-v2.soundcloud.com/' + _BASE_URL = 'https://soundcloud.com/' + _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' + + _ARTWORK_MAP = { + 'mini': 16, + 'tiny': 20, + 'small': 32, + 'badge': 47, + 't67x67': 67, + 'large': 100, + 't300x300': 300, + 'crop': 400, + 't500x500': 500, + 'original': 0, + } + + def _store_client_id(self, client_id): + self._downloader.cache.store('soundcloud', 'client_id', client_id) + + def _update_client_id(self): + webpage = self._download_webpage('https://soundcloud.com/', None) + for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)): + script = self._download_webpage(src, None, 
fatal=False) + if script: + client_id = self._search_regex( + r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', + script, 'client id', default=None) + if client_id: + self._CLIENT_ID = client_id + self._store_client_id(client_id) + return + raise ExtractorError('Unable to extract client id') + + def _download_json(self, *args, **kwargs): + non_fatal = kwargs.get('fatal') is False + if non_fatal: + del kwargs['fatal'] + query = kwargs.get('query', {}).copy() + for _ in range(2): + query['client_id'] = self._CLIENT_ID + kwargs['query'] = query + try: + return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self._store_client_id(None) + self._update_client_id() + continue + elif non_fatal: + self._downloader.report_warning(error_to_compat_str(e)) + return False + raise + + def _real_initialize(self): + self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or "T5R4kgWS2PRf6lzLyIravUMnKlbIxQag" # 'EXLwg5lHTO2dslU5EePe3xkw0m1h86Cd' # 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' + self._login() + + _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36" + _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' + _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' + _access_token = None + _HEADERS = {} + _NETRC_MACHINE = 'soundcloud' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def genDevId(): + def genNumBlock(): + return ''.join([str(random.randrange(10)) for i in range(6)]) + return '-'.join([genNumBlock() for i in range(4)]) + + payload = { + 'client_id': self._CLIENT_ID, + 'recaptcha_pubkey': 'null', + 'recaptcha_response': 'null', + 'credentials': { + 'identifier': username, + 'password': password + }, + 'signature': self.sign(username, password, self._CLIENT_ID), + 'device_id': genDevId(), + 
'user_agent': self._USER_AGENT + } + + query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID + login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) + response = self._download_json(login, None) + self._access_token = response.get('session').get('access_token') + if not self._access_token: + self.report_warning('Unable to get access token, login may has failed') + else: + self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + + # signature generation + def sign(self, user, pw, clid): + a = 33 + i = 1 + s = 440123 + w = 117 + u = 1800000 + l = 1042 + b = 37 + k = 37 + c = 5 + n = "0763ed7314c69015fd4a0dc16bbf4b90" # _KEY + y = "8" # _REV + r = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36" # _USER_AGENT + e = user # _USERNAME + t = clid # _CLIENT_ID + + d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]]) + p = n + y + d + r + e + t + d + n + h = p + + m = 8011470 + f = 0 + + for f in range(f, len(h)): + m = (m >> 1) + ((1 & m) << 23) + m += ord(h[f]) + m &= 16777215 + + # c is not even needed + out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c) + + return out + + @classmethod + def _resolv_url(cls, url): + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + + def _extract_info_dict(self, info, full_title=None, secret_token=None): + track_id = compat_str(info['id']) + title = info['title'] + + format_urls = set() + formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token: + query['secret_token'] = secret_token + + if info.get('downloadable') and info.get('has_downloads_left'): + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url 
= urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'preference': 10, + }) + + def invalid_url(url): + return not url or url in format_urls + + def add_format(f, protocol, is_preview=False): + mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) + if mobj: + for k, v in mobj.groupdict().items(): + if not f.get(k): + f[k] = v + format_id_list = [] + if protocol: + format_id_list.append(protocol) + ext = f.get('ext') + if ext == 'aac': + f['abr'] = '256' + for k in ('ext', 'abr'): + v = f.get(k) + if v: + format_id_list.append(v) + preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) + if preview: + format_id_list.append('preview') + abr = f.get('abr') + if abr: + f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' + f.update({ + 'format_id': '_'.join(format_id_list), + 'protocol': protocol, + 'preference': -10 if preview else None, + }) + formats.append(f) + + # New API + transcodings = try_get( + info, lambda x: x['media']['transcodings'], list) or [] + for t in transcodings: + if not isinstance(t, dict): + continue + format_url = url_or_none(t.get('url')) + if not format_url: + continue + stream = self._download_json( + format_url, track_id, query=query, fatal=False, headers=self._HEADERS) + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + stream_format = t.get('format') or {} + protocol = stream_format.get('protocol') + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + ext = None + preset = str_or_none(t.get('preset')) + if preset: + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + ext = 
mimetype2ext(stream_format.get('mime_type')) + add_format({ + 'url': stream_url, + 'ext': ext, + }, 'http' if protocol == 'progressive' else protocol, + t.get('snipped') or '/preview/' in format_url) + + for f in formats: + f['vcodec'] = 'none' + + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted() + self._sort_formats(formats) + + user = info.get('user') or {} + + thumbnails = [] + artwork_url = info.get('artwork_url') + thumbnail = artwork_url or user.get('avatar_url') + if isinstance(thumbnail, compat_str): + if re.search(self._IMAGE_REPL_RE, thumbnail): + for image_id, size in self._ARTWORK_MAP.items(): + i = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + i['preference'] = 10 + if size: + i.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(i) + else: + thumbnails = [{'url': thumbnail}] + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + return { + 'id': track_id, + 'uploader': user.get('username'), + 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), + 'uploader_url': user.get('permalink_url'), + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), + 'thumbnails': thumbnails, + 'duration': float_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings') or extract_count('likes'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), + 'formats': formats + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + track_id = mobj.group('track_id') + + query = {} + if track_id: + info_json_url = self._API_V2_BASE + 'tracks/' + track_id + 
full_title = track_id + token = mobj.group('secret_token') + if token: + query['secret_token'] = token + else: + full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title') + token = mobj.group('token') + if token: + resolve_title += '/%s' % token + info_json_url = self._resolv_url(self._BASE_URL + resolve_title) + + info = self._download_json( + info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS) + + return self._extract_info_dict(info, full_title, token) + + +class SoundcloudPlaylistBaseIE(SoundcloudIE): + def _extract_set(self, playlist, token=None): + playlist_id = compat_str(playlist['id']) + tracks = playlist.get('tracks') or [] + if not all([t.get('permalink_url') for t in tracks]) and token: + tracks = self._download_json( + self._API_V2_BASE + 'tracks', playlist_id, + 'Downloading tracks', query={ + 'ids': ','.join([compat_str(t['id']) for t in tracks]), + 'playlistId': playlist_id, + 'playlistSecretToken': token, + }, headers=self._HEADERS) + entries = [] + for track in tracks: + track_id = str_or_none(track.get('id')) + url = track.get('permalink_url') + if not url: + if not track_id: + continue + url = self._API_V2_BASE + 'tracks/' + track_id + if token: + url += '?secret_token=' + token + entries.append(self.url_result( + url, SoundcloudIE.ie_key(), track_id)) + return self.playlist_result( + entries, playlist_id, + playlist.get('title'), + playlist.get('description')) + + +class SoundcloudSetIE(SoundcloudPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[:\w\d-]+)(?:/(?P<token>[^?/]+))?' 
+ IE_NAME = 'soundcloud:set' + _TESTS = [{ + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', + 'info_dict': { + 'id': '2284613', + 'title': 'The Royal Concept EP', + 'description': 'md5:71d07087c7a449e8941a70a29e34671e', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', + 'only_matching': True, + }, { + 'url': 'https://soundcloud.com/discover/sets/weekly::flacmatic', + 'only_matching': True, + }, { + 'url': 'https://soundcloud.com/discover/sets/charts-top:all-music:de', + 'only_matching': True, + }, { + 'url': 'https://soundcloud.com/discover/sets/charts-top:hiphoprap:kr', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title') + token = mobj.group('token') + if token: + full_title += '/' + token + + info = self._download_json(self._resolv_url( + self._BASE_URL + full_title), full_title, headers=self._HEADERS) + + if 'errors' in info: + msgs = (compat_str(err['error_message']) for err in info['errors']) + raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) + + return self._extract_set(info, token) + + +class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): + def _extract_playlist(self, base_url, playlist_id, playlist_title): + # Per the SoundCloud documentation, the maximum limit for a linked partioning query is 200. 
+ # https://developers.soundcloud.com/blog/offset-pagination-deprecated + COMMON_QUERY = { + 'limit': 200, + 'linked_partitioning': '1', + } + + query = COMMON_QUERY.copy() + query['offset'] = 0 + + next_href = base_url + + entries = [] + for i in itertools.count(): + response = self._download_json( + next_href, playlist_id, + 'Downloading track page %s' % (i + 1), query=query, headers=self._HEADERS) + + collection = response['collection'] + + if not isinstance(collection, list): + collection = [] + + # Empty collection may be returned, in this case we proceed + # straight to next_href + + def resolve_entry(candidates): + for cand in candidates: + if not isinstance(cand, dict): + continue + permalink_url = url_or_none(cand.get('permalink_url')) + if not permalink_url: + continue + return self.url_result( + permalink_url, + SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None, + str_or_none(cand.get('id')), cand.get('title')) + + for e in collection: + entry = resolve_entry((e, e.get('track'), e.get('playlist'))) + if entry: + entries.append(entry) + + next_href = response.get('next_href') + if not next_href: + break + + next_href = response['next_href'] + parsed_next_href = compat_urlparse.urlparse(next_href) + query = compat_urlparse.parse_qs(parsed_next_href.query) + query.update(COMMON_QUERY) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'entries': entries, + } + + +class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?soundcloud\.com/ + (?P<user>[^/]+) + (?:/ + (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight) + )? 
+ /?(?:[?#].*)?$ + ''' + IE_NAME = 'soundcloud:user' + _TESTS = [{ + 'url': 'https://soundcloud.com/soft-cell-official', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (All)', + }, + 'playlist_mincount': 28, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/tracks', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (Tracks)', + }, + 'playlist_mincount': 27, + }, { + 'url': 'https://soundcloud.com/soft-cell-official/albums', + 'info_dict': { + 'id': '207965082', + 'title': 'Soft Cell (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/jcv246/sets', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Sets)', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://soundcloud.com/jcv246/reposts', + 'info_dict': { + 'id': '12982173', + 'title': 'Jordi / cv (Reposts)', + }, + 'playlist_mincount': 6, + }, { + 'url': 'https://soundcloud.com/clalberg/likes', + 'info_dict': { + 'id': '11817582', + 'title': 'clalberg (Likes)', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://soundcloud.com/grynpyret/spotlight', + 'info_dict': { + 'id': '7098329', + 'title': 'Grynpyret (Spotlight)', + }, + 'playlist_mincount': 1, + }] + + _BASE_URL_MAP = { + 'all': 'stream/users/%s', + 'tracks': 'users/%s/tracks', + 'albums': 'users/%s/albums', + 'sets': 'users/%s/playlists', + 'reposts': 'stream/users/%s/reposts', + 'likes': 'users/%s/likes', + 'spotlight': 'users/%s/spotlight', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader = mobj.group('user') + + user = self._download_json( + self._resolv_url(self._BASE_URL + uploader), + uploader, 'Downloading user info', headers=self._HEADERS) + + resource = mobj.group('rsrc') or 'all' + + return self._extract_playlist( + self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'], + str_or_none(user.get('id')), + '%s (%s)' % (user['username'], resource.capitalize())) + + +class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): 
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' + IE_NAME = 'soundcloud:trackstation' + _TESTS = [{ + 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', + 'info_dict': { + 'id': '286017854', + 'title': 'Track station: your text', + }, + 'playlist_mincount': 47, + }] + + def _real_extract(self, url): + track_name = self._match_id(url) + + track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS) + track_id = self._search_regex( + r'soundcloud:track-stations:(\d+)', track['id'], 'track id') + + return self._extract_playlist( + self._API_V2_BASE + 'stations/%s/tracks' % track['id'], + track_id, 'Track station: %s' % track['title']) + + +class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): + _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' + IE_NAME = 'soundcloud:playlist' + _TESTS = [{ + 'url': 'https://api.soundcloud.com/playlists/4110309', + 'info_dict': { + 'id': '4110309', + 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', + 'description': 're:.*?TILT Brass - Bowery Poetry Club', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + + query = {} + token = mobj.group('token') + if token: + query['secret_token'] = token + + data = self._download_json( + self._API_V2_BASE + 'playlists/' + playlist_id, + playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS) + + return self._extract_set(data, token) + + +class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): + IE_NAME = 'soundcloud:search' + IE_DESC = 'Soundcloud search' + _MAX_RESULTS = float('inf') + _TESTS = [{ + 'url': 'scsearch15:post-avant jazzcore', + 'info_dict': { + 'title': 'post-avant jazzcore', + }, + 'playlist_count': 15, + }] + + _SEARCH_KEY = 'scsearch' + _MAX_RESULTS_PER_PAGE = 200 + 
_DEFAULT_RESULTS_PER_PAGE = 50 + + def _get_collection(self, endpoint, collection_id, **query): + limit = min( + query.get('limit', self._DEFAULT_RESULTS_PER_PAGE), + self._MAX_RESULTS_PER_PAGE) + query.update({ + 'limit': limit, + 'linked_partitioning': 1, + 'offset': 0, + }) + next_url = update_url_query(self._API_V2_BASE + endpoint, query) + + collected_results = 0 + + for i in itertools.count(1): + response = self._download_json( + next_url, collection_id, 'Downloading page {0}'.format(i), + 'Unable to download API page', headers=self._HEADERS) + + collection = response.get('collection', []) + if not collection: + break + + collection = list(filter(bool, collection)) + collected_results += len(collection) + + for item in collection: + yield self.url_result(item['uri'], SoundcloudIE.ie_key()) + + if not collection or collected_results >= limit: + break + + next_url = response.get('next_href') + if not next_url: + break + + def _get_n_results(self, query, n): + tracks = self._get_collection('search/tracks', query, limit=n, q=query) + return self.playlist_result(tracks, playlist_title=query) diff --git a/youtube_dlc/extractor/soundgasm.py b/youtube_dlc/extractor/soundgasm.py new file mode 100644 index 0000000..3d78a9d --- /dev/null +++ b/youtube_dlc/extractor/soundgasm.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', + 'md5': '010082a2c802c5275bb00030743e75ad', + 'info_dict': { + 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', + 'ext': 'm4a', + 'title': 'Piano sample', + 'description': 'Royalty Free Sample Music', + 'uploader': 'ytdl', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = 
class SoundgasmProfileIE(InfoExtractor):
    # Matches a user's profile page (soundgasm.net/u/<user>) and returns
    # every audio link found on it as a playlist.
    IE_NAME = 'soundgasm:profile'
    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$'
    _TEST = {
        'url': 'http://soundgasm.net/u/ytdl',
        'info_dict': {
            'id': 'ytdl',
        },
        'playlist_count': 1,
    }

    def _real_extract(self, url):
        """Return a playlist of all audio pages linked from the profile page."""
        profile_id = self._match_id(url)

        webpage = self._download_webpage(url, profile_id)

        # The id is taken verbatim from the URL ([^/]+), so it may contain
        # regex metacharacters; escape it before building the pattern.
        audio_url_re = r'href="([^"]+/u/%s/[^"]+)' % re.escape(profile_id)

        entries = [
            self.url_result(audio_url, 'Soundgasm')
            for audio_url in re.findall(audio_url_re, webpage)]

        return self.playlist_result(entries, profile_id)
# Variant of SouthParkIE for the "episodios-en-espanol" pages on
# southpark.cc.com. Only the name, URL pattern, _LANG and tests are defined
# here; everything else comes from SouthParkIE.
class SouthParkEsIE(SouthParkIE):
    IE_NAME = 'southpark.cc.com:español'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
    # NOTE(review): presumably the language code read by the base MTV
    # services extractor - confirm against mtv.py.
    _LANG = 'es'

    _TESTS = [{
        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
        'info_dict': {
            'title': 'Cartman Consigue Una Sonda Anal',
            'description': 'Cartman Consigue Una Sonda Anal',
        },
        'playlist_count': 4,
        'skip': 'Geo-restricted',
    }]
# Variant of SouthParkIE for southpark.nl (clips, episodes, full-episodes
# and collections pages). It overrides the URL pattern and points _FEED_URL
# at the southpark.nl MRSS feed; everything else comes from SouthParkIE.
class SouthParkNlIE(SouthParkIE):
    IE_NAME = 'southpark.nl'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
    _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'

    _TESTS = [{
        'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
        'info_dict': {
            'title': 'Freemium Isn\'t Free',
            'description': 'Stan is addicted to the new Terrance and Phillip mobile game.',
        },
        'playlist_mincount': 3,
    }]
def _real_extract(self, url):
    """Extract metadata and formats for a single SpankBang video.

    Embed URLs are rewritten to the regular video page before download.
    Formats are taken from the ``stream_url_*`` assignments in the page;
    when none are found, the stream API is queried with the page's
    ``data-streamkey`` value instead.

    Raises ExtractorError (expected) when the page carries the
    ``video_removed`` marker.
    """
    video_id = self._match_id(url)
    webpage = self._download_webpage(
        url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
        video_id, headers={'Cookie': 'country=US'})

    if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage):
        raise ExtractorError(
            'Video %s is not available' % video_id, expected=True)

    formats = []

    def extract_format(format_id, format_url):
        # Append the format(s) behind format_url to ``formats``; values
        # that are not valid URLs are skipped.
        f_url = url_or_none(format_url)
        if not f_url:
            return
        f = parse_resolution(format_id)
        ext = determine_ext(f_url)
        if format_id.startswith('m3u8') or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                f_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False))
        elif format_id.startswith('mpd') or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                f_url, video_id, mpd_id='dash', fatal=False))
        elif ext == 'mp4' or f.get('width') or f.get('height'):
            f.update({
                'url': f_url,
                'format_id': format_id,
            })
            formats.append(f)

    STREAM_URL_PREFIX = 'stream_url_'

    for mobj in re.finditer(
            r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2'
            % STREAM_URL_PREFIX, webpage):
        # group() with two names returns a tuple; unpack it into the two
        # positional arguments extract_format expects.
        extract_format(*mobj.group('id', 'url'))

    if not formats:
        stream_key = self._search_regex(
            r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
            webpage, 'stream key', group='value')

        stream = self._download_json(
            'https://spankbang.com/api/videos/stream', video_id,
            'Downloading stream JSON', data=urlencode_postdata({
                'id': stream_key,
                'data': 0,
            }), headers={
                'Referer': url,
                'X-Requested-With': 'XMLHttpRequest',
            })

        for format_id, format_url in stream.items():
            # Values may be lists of URLs; only the first entry is used.
            if format_url and isinstance(format_url, list):
                format_url = format_url[0]
            extract_format(format_id, format_url)

    self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id'))

    info = self._search_json_ld(webpage, video_id, default={})

    title = self._html_search_regex(
        r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None)
    description = self._search_regex(
        r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)',
        webpage, 'description', default=None)
    thumbnail = self._og_search_thumbnail(webpage, default=None)
    uploader = self._html_search_regex(
        (r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>',
         r'class="user"[^>]*><img[^>]+>([^<]+)'),
        webpage, 'uploader', default=None)
    duration = parse_duration(self._search_regex(
        r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)',
        webpage, 'duration', default=None))
    view_count = str_to_int(self._search_regex(
        r'([\d,.]+)\s+plays', webpage, 'view count', default=None))

    age_limit = self._rta_search(webpage)

    return merge_dicts({
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'thumbnail': thumbnail,
        'uploader': uploader,
        'duration': duration,
        'view_count': view_count,
        'formats': formats,
        'age_limit': age_limit,
    }, info)
def _real_extract(self, url):
    """Extract metadata and formats for a Spankwire video.

    The bulk of the data comes from the JSON API
    (``/api/video/<id>.json``). The HTML page is then fetched
    non-fatally to add JSON-LD info, an extra thumbnail, the uploader
    name and, if the API gave none, the view count.
    """
    video_id = self._match_id(url)

    video = self._download_json(
        'https://www.spankwire.com/api/video/%s.json' % video_id, video_id)

    title = video['title']

    formats = []
    videos = video.get('videos')
    if isinstance(videos, dict):
        for format_id, format_url in videos.items():
            video_url = url_or_none(format_url)
            # Test the sanitized value: a non-URL format_url is truthy,
            # yet video_url is None and would crash re.search below.
            if not video_url:
                continue
            height = int_or_none(self._search_regex(
                r'(\d+)[pP]', format_id, 'height', default=None))
            # Download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
            m = re.search(
                r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url)
            if m:
                tbr = int(m.group('tbr'))
                height = height or int(m.group('height'))
            else:
                tbr = None
            formats.append({
                'url': video_url,
                'format_id': '%dp' % height if height else format_id,
                'height': height,
                'tbr': tbr,
            })
    m3u8_url = url_or_none(video.get('HLS'))
    if m3u8_url:
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls', fatal=False))
    self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id'))

    view_count = str_to_int(video.get('viewed'))

    thumbnails = []
    # 'poster' gets preference 0, 'poster2x' gets preference 1.
    for preference, t in enumerate(('', '2x'), start=0):
        thumbnail_url = url_or_none(video.get('poster%s' % t))
        if not thumbnail_url:
            continue
        thumbnails.append({
            'url': thumbnail_url,
            'preference': preference,
        })

    def extract_names(key):
        # Collect the 'name' of every entry in video[key]; returns None
        # when that value is not a list.
        entries_list = video.get(key)
        if not isinstance(entries_list, list):
            return
        entries = []
        for entry in entries_list:
            name = str_or_none(entry.get('name'))
            if name:
                entries.append(name)
        return entries

    categories = extract_names('categories')
    tags = extract_names('tags')

    uploader = None
    info = {}

    webpage = self._download_webpage(
        'https://www.spankwire.com/_/video%s/' % video_id, video_id,
        fatal=False)
    if webpage:
        info = self._search_json_ld(webpage, video_id, default={})
        thumbnail_url = None
        if 'thumbnail' in info:
            # Move the JSON-LD thumbnail into the thumbnails list so it
            # does not end up as a separate 'thumbnail' field.
            thumbnail_url = url_or_none(info['thumbnail'])
            del info['thumbnail']
        if not thumbnail_url:
            thumbnail_url = self._og_search_thumbnail(webpage)
        if thumbnail_url:
            thumbnails.append({
                'url': thumbnail_url,
                'preference': 10,
            })
        uploader = self._html_search_regex(
            r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>',
            webpage, 'uploader', fatal=False)
        if not view_count:
            view_count = str_to_int(self._search_regex(
                r'data-views=["\']([\d,.]+)', webpage, 'view count',
                fatal=False))

    return merge_dicts({
        'id': video_id,
        'title': title,
        'description': video.get('description'),
        'duration': int_or_none(video.get('duration')),
        'thumbnails': thumbnails,
        'uploader': uploader,
        'uploader_id': str_or_none(video.get('userId')),
        'timestamp': int_or_none(video.get('time_approved_on')),
        'average_rating': float_or_none(video.get('rating')),
        'view_count': view_count,
        'comment_count': int_or_none(video.get('comments')),
        'age_limit': 18,
        'categories': categories,
        'tags': tags,
        'formats': formats,
    }, info)
def _real_extract(self, url):
    """Resolve a spiegel.de video page to its Nexx video.

    The metadata JSON for the video id is requested first. If that
    request ends up on a URL SpiegeltvIE can handle, extraction is
    delegated to it. Otherwise a ``url_transparent`` result pointing at
    the ``nexx:<domain>:<id>`` URL is returned.
    """
    video_id = self._match_id(url)
    metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
    handle = self._request_webpage(metadata_url, video_id)

    # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
    final_url = handle.geturl()
    if SpiegeltvIE.suitable(final_url):
        return self.url_result(final_url, 'Spiegeltv')

    raw_metadata = self._webpage_read_content(handle, metadata_url, video_id)
    video_data = self._parse_json(raw_metadata, video_id)

    # Mandatory fields are read first, in this order.
    title = video_data['title']
    nexx_id = video_data['nexxOmniaId']
    # Fall back to domain 748 when the metadata names no Nexx domain.
    domain_id = video_data.get('nexxOmniaDomain') or '748'

    result = {
        '_type': 'url_transparent',
        'id': video_id,
        'url': 'nexx:%s:%s' % (domain_id, nexx_id),
        'title': title,
    }
    result['description'] = strip_or_none(video_data.get('teaser'))
    result['duration'] = parse_duration(video_data.get('duration'))
    result['timestamp'] = unified_timestamp(video_data.get('datum'))
    result['ie_key'] = NexxIE.ie_key()
    return result
class SpiegeltvIE(InfoExtractor):
    # spiegel.tv video pages; extraction is handed over to NexxIE.
    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Map the numeric video id onto the Nexx API URL (domain 748)."""
        video_id = self._match_id(url)
        nexx_url = 'https://api.nexx.cloud/v3/748/videos/byid/%s' % video_id
        return self.url_result(nexx_url, ie=NexxIE.ie_key())
Evangelista Cyborg', + 'description': 'md5:0d917fc00ffd72dd92814963fc6cbb05', + }, + 'playlist_count': 3, + }, { + 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] + + _FEED_URL = 'http://www.bellator.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + def _extract_mgid(self, webpage, url): + mgid = None + + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + + if not mgid: + mgid = self._extract_new_triforce_mgid(webpage, url) + + return mgid + +# TODO Remove - Reason: Outdated Site + + +class ParamountNetworkIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ + 'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13', + 'info_dict': { + 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', + 'ext': 'mp4', + 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. Jim Rash|Act 1', + 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' + _GEO_COUNTRIES = ['US'] + + def _extract_mgid(self, webpage, url): + root_data = self._parse_json(self._search_regex( + r'window\.__DATA__\s*=\s*({.+})', + webpage, 'data'), None) + + def find_sub_data(data, data_type): + return next(c for c in data['children'] if c.get('type') == data_type) + + c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') + return c['props']['media']['video']['config']['uri'] diff --git a/youtube_dlc/extractor/sport5.py b/youtube_dlc/extractor/sport5.py new file mode 100644 index 0000000..a417b5a --- /dev/null +++ b/youtube_dlc/extractor/sport5.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class 
Sport5IE(InfoExtractor): + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', + 'info_dict': { + 'id': 's5-Y59xx1-GUh2', + 'ext': 'mp4', + 'title': 'ולנסיה-קורדובה 0:3', + 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', + 'duration': 228, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + }, { + 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', + 'info_dict': { + 'id': 's5-SiXxx1-hKh2', + 'ext': 'mp4', + 'title': 'GOALS_CELTIC_270914.mp4', + 'description': '', + 'duration': 87, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + media_id = mobj.group('id') + + webpage = self._download_webpage(url, media_id) + + video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id') + + metadata = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, + video_id) + + error = metadata.find('./Error') + if error is not None: + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, + error.find('./Name').text, + error.find('./Description').text), + expected=True) + + title = metadata.find('./Title').text + description = metadata.find('./Description').text + duration = int(metadata.find('./Duration').text) + + posters_el = metadata.find('./PosterLinks') + thumbnails = [{ + 'url': thumbnail.text, + 'width': int(thumbnail.get('width')), + 'height': int(thumbnail.get('height')), + } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] + + categories_el = metadata.find('./Categories') + categories = [ + cat.get('name') for cat in categories_el.findall('./Category') + ] if categories_el is not None else [] + + formats = [{ + 'url': fmt.text, + 'ext': 'mp4', + 'vbr': 
def _real_extract(self, url):
    """Extract a Sportbox / Match TV player page.

    Formats come from the ``playerOptions.sources`` array embedded in the
    page (HLS manifests are expanded, anything else is used as a direct
    URL). The optional ``playerOptions`` object supplies the media id,
    poster and duration.
    """
    video_id = self._match_id(url)

    webpage = self._download_webpage(url, video_id)

    sources = self._parse_json(
        self._search_regex(
            r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n',
            webpage, 'sources'),
        video_id, transform_source=js_to_json)

    formats = []
    for source in sources:
        src = source.get('src')
        if not src:
            continue
        if determine_ext(src) == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                src, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id='hls', fatal=False))
        else:
            formats.append({
                'url': src,
            })
    self._sort_formats(formats)

    player = self._parse_json(
        self._search_regex(
            r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage,
            'player options', default='{}'),
        video_id, transform_source=js_to_json)
    # The player options are optional (default '{}' above), so the media
    # id may be missing; fall back to the id from the URL instead of
    # failing with a KeyError.
    media_id = player.get('mediaId') or video_id

    info = self._search_json_ld(webpage, media_id, default={})

    view_count = int_or_none(self._search_regex(
        r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))

    return merge_dicts(info, {
        'id': media_id,
        'title': self._og_search_title(webpage, default=None) or media_id,
        'thumbnail': player.get('poster'),
        'duration': int_or_none(player.get('duration')),
        'view_count': view_count,
        'formats': formats,
    })
def _real_extract(self, url):
    """Extract a sportdeutschland.tv video through the permalink API.

    When the asset's video URL is a SMIL document, the matching ``.m3u8``
    manifest is extracted and one low-preference RTMP format is added per
    ``<video>`` node of the SMIL body. Otherwise the video URL is used as
    a single direct format.
    """
    mobj = re.match(self._VALID_URL, url)
    video_id = mobj.group('id')
    sport_id = mobj.group('sport')

    api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
        sport_id, video_id)
    req = sanitized_Request(api_url, headers={
        'Accept': 'application/vnd.vidibus.v2.html+json',
        'Referer': url,
    })
    data = self._download_json(req, video_id)

    asset = data['asset']
    categories = [data['section']['title']]

    formats = []
    smil_url = asset['video']
    if '.smil' in smil_url:
        m3u8_url = smil_url.replace('.smil', '.m3u8')
        formats.extend(
            self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'))

        smil_doc = self._download_xml(
            smil_url, video_id, note='Downloading SMIL metadata')
        base_url_el = smil_doc.find('./head/meta')
        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truth test would always skip the <meta> tag.
        base_url = (
            base_url_el.attrib.get('base')
            if base_url_el is not None else None)
        formats.extend([{
            # The 'rmtp' spelling is kept as is so existing format
            # selectors keep matching.
            'format_id': 'rmtp',
            # Without a base URL, fall back to the node's own src.
            'url': base_url or n.attrib['src'],
            'play_path': n.attrib['src'],
            'ext': 'flv',
            'preference': -100,
            'format_note': 'Seems to fail at example stream',
        } for n in smil_doc.findall('./body/video')])
    else:
        formats.append({'url': smil_url})

    self._sort_formats(formats)

    return {
        'id': video_id,
        'formats': formats,
        'title': asset['title'],
        'thumbnail': asset.get('image'),
        'description': asset.get('teaser'),
        'duration': asset.get('duration'),
        'categories': categories,
        'view_count': asset.get('views'),
        'rtmp_live': asset.get('live'),
        'timestamp': parse_iso8601(asset.get('date')),
    }
def _real_extract(self, url):
    """Extract a Springboard Platform video from its RSS3 feed.

    Both URL shapes accepted by _VALID_URL carry an index and a video id;
    they are used to download the advanced XML feed, whose first <item>
    supplies the media URL and metadata. An HLS variant is derived from
    the HTTP URL and offered alongside it.

    Raises ExtractorError (expected) when the feed points at the
    'error_video.mp4' placeholder.
    """
    match = re.match(self._VALID_URL, url)
    video_id = match.group('id') or match.group('id_2')
    feed_index = match.group('index') or match.group('index_2')

    feed_url = (
        'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s'
        % (feed_index, video_id))
    feed = self._download_xml(feed_url, video_id)

    item = xpath_element(feed, './/item', 'item', fatal=True)
    content = xpath_element(
        item, './{http://search.yahoo.com/mrss/}content', 'content',
        fatal=True)
    title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True))

    video_url = content.attrib['url']
    if 'error_video.mp4' in video_url:
        raise ExtractorError(
            'Video %s no longer exists' % video_id, expected=True)

    http_format = {
        'url': video_url,
        'format_id': 'http',
        'tbr': int_or_none(content.get('bitrate')),
        'filesize': int_or_none(content.get('fileSize')),
        'width': int_or_none(content.get('width')),
        'height': int_or_none(content.get('height')),
    }

    # The HLS rendition lives on the 'hls.' host next to the 'cdn.' one and
    # shares every technical attribute of the HTTP rendition.
    hls_format = dict(http_format)
    hls_format.update({
        'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8',
        'ext': 'mp4',
        'format_id': 'hls',
        'protocol': 'm3u8_native',
    })

    formats = [http_format, hls_format]
    self._sort_formats(formats)

    return {
        'id': video_id,
        'title': title,
        'description': unescapeHTML(xpath_text(
            item, './description', 'description')),
        'thumbnail': xpath_attr(
            item, './{http://search.yahoo.com/mrss/}thumbnail', 'url',
            'thumbnail'),
        'timestamp': unified_timestamp(xpath_text(
            item, './{http://cms.springboardplatform.com/namespaces.html}created',
            'timestamp')),
        'duration': int_or_none(content.get('duration')),
        'formats': formats,
    }
class SproutIE(AdobePassIE):
    """Extractor for sproutonline.com watch pages.

    The page either embeds a video component whose 'data-options' JSON
    names a ThePlatform URL, or a plain iframe pointing at it; in both
    cases extraction is delegated to the ThePlatform extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)'
    _TEST = {
        'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
        'md5': '74bf14128578d1e040c3ebc82088f45f',
        'info_dict': {
            'id': '9dexnwtmh8_X',
            'ext': 'mp4',
            'title': 'A Cowboy Adventure',
            'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.',
            'timestamp': 1437758640,
            'upload_date': '20150724',
            'uploader': 'NBCU-SPROUT-NEW',
        }
    }

    def _real_extract(self, url):
        """Return a url_result pointing at the page's ThePlatform media."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        component_tag = self._search_regex(
            r'(?s)(<div[^>]+data-component="video"[^>]*?>)',
            webpage, 'video component', default=None)

        if not component_tag:
            # No video component: fall back to the embedded iframe.
            iframe_tag = self._search_regex(
                r'(<iframe[^>]+id="sproutVideoIframe"[^>]*?>)',
                webpage, 'iframe')
            return self.url_result(
                extract_attributes(iframe_tag)['src'], 'ThePlatform')

        options = self._parse_json(
            extract_attributes(component_tag)['data-options'], video_id)
        media_url = options['video']
        query = {
            'mbr': 'true',
            'manifest': 'm3u',
        }
        if options.get('protected'):
            # Protected media needs an MVPD authorization token.
            query['auth'] = self._extract_mvpd_auth(
                url, options['pid'], 'sprout', 'sprout')
        media_url = smuggle_url(
            update_url_query(media_url, query), {'force_smil_url': True})
        return self.url_result(media_url, 'ThePlatform')
r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['CH'] + + _ERRORS = { + 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', + 'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.', + # 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.', + 'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.', + 'LEGAL': 'The video cannot be transmitted for legal reasons.', + 'STARTDATE': 'This video is not yet available. Please try again later.', + } + + def _get_tokenized_src(self, url, video_id, format_id): + sp = compat_urllib_parse_urlparse(url).path.split('/') + token = self._download_json( + 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + video_id, 'Downloading %s token' % format_id, fatal=False) or {} + auth_params = token.get('token', {}).get('authparams') + if auth_params: + url += '?' 
def _real_extract(self, url):
    """Extract an SRG SSR media item from an 'srgssr:' URN or tp.srgssr.ch URL.

    The business unit, media type and id captured by _VALID_URL are used
    to fetch the media description via get_media_data(). Every asset of
    every playlist/download entry becomes a format: HDS and HLS assets are
    tokenized first and expanded through their manifests, everything else
    is offered as a direct URL ranked by its quality label.
    """
    bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
    media_data = self.get_media_data(bu, media_type, media_id)

    metadata = media_data['AssetMetadatas']['AssetMetadata'][0]
    title = metadata['title']
    description = metadata.get('description')
    timestamp = parse_iso8601(
        media_data.get('createdDate') or metadata.get('createdDate'))

    images = media_data.get('Image', {}).get(
        'ImageRepresentations', {}).get('ImageRepresentation', [])
    thumbnails = [{
        'id': image.get('id'),
        'url': image['url'],
    } for image in images]

    preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
    playlists = media_data.get('Playlists', {}).get('Playlist', [])
    downloads = media_data.get('Downloads', {}).get('Download', [])

    formats = []
    for source in playlists + downloads:
        protocol = source.get('@protocol')
        for asset in source['url']:
            asset_url = asset['text']
            quality = asset['@quality']
            format_id = '%s-%s' % (protocol, quality)
            is_hds = protocol.startswith('HTTP-HDS')
            is_hls = protocol.startswith('HTTP-HLS')

            if not (is_hds or is_hls):
                # Plain progressive/RTMP asset: offer the URL as-is.
                formats.append({
                    'format_id': format_id,
                    'url': asset_url,
                    'preference': preference(quality),
                    'ext': 'flv' if protocol == 'RTMP' else None,
                })
                continue

            # Manifest-based protocols need a token appended first.
            asset_url = self._get_tokenized_src(asset_url, media_id, format_id)
            if is_hds:
                separator = '&' if '?' in asset_url else '?'
                formats.extend(self._extract_f4m_formats(
                    asset_url + separator + 'hdcore=3.4.0',
                    media_id, f4m_id=format_id, fatal=False))
            else:
                formats.extend(self._extract_m3u8_formats(
                    asset_url, media_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id, fatal=False))
    self._sort_formats(formats)

    return {
        'id': media_id,
        'title': title,
        'description': description,
        'timestamp': timestamp,
        'thumbnails': thumbnails,
        'formats': formats,
    }
'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'info_dict': { + 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'ext': 'mp3', + 'upload_date': '20151013', + 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', + 'timestamp': 1444750398, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', + 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', + 'info_dict': { + 'id': '6348260', + 'display_id': '6348260', + 'ext': 'mp4', + 'duration': 1796, + 'title': 'Le 19h30', + 'description': '', + 'uploader': '19h30', + 'upload_date': '20141201', + 'timestamp': 1417458600, + 'thumbnail': r're:^https?://.*\.image', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', + 'only_matching': True, + }, { + 'url': 'https://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?urn=urn:srf:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'only_matching': True, + }, { + 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + bu = mobj.group('bu') + media_type = mobj.group('type') or mobj.group('type_2') + media_id = mobj.group('id') + # other info can be extracted from url + '&layout=json' + return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff --git a/youtube_dlc/extractor/srmediathek.py b/youtube_dlc/extractor/srmediathek.py new file mode 100644 index 0000000..359dada --- /dev/null +++ b/youtube_dlc/extractor/srmediathek.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .ard import ARDMediathekBaseIE +from ..utils import ( + 
class SRMediathekIE(ARDMediathekBaseIE):
    """Extractor for the Saarländischer Rundfunk Mediathek.

    Pages embed an ARD player media-collection URL; format extraction is
    delegated to the ARD base class and page-level metadata is layered on
    top of its result.
    """

    IE_NAME = 'sr:mediathek'
    IE_DESC = 'Saarländischer Rundfunk'
    _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'

    _TESTS = [{
        'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',
        'info_dict': {
            'id': '28455',
            'ext': 'mp4',
            'title': 'sportarena (26.10.2014)',
            'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'skip': 'no longer available',
    }, {
        'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682',
        'info_dict': {
            'id': '37682',
            'ext': 'mp4',
            'title': 'Love, Cakes and Rock\'n\'Roll',
            'description': 'md5:18bf9763631c7d326c22603681e1123d',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for a Mediathek page.

        Raises ExtractorError (expected) when the page carries the
        "no longer available" notice.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        unavailable_marker = '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<'
        if unavailable_marker in webpage:
            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)

        collection_url = self._search_regex(
            r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url')
        info = self._extract_media_info(collection_url, webpage, video_id)

        # Page-level metadata overrides whatever the media collection gave.
        page_metadata = {
            'id': video_id,
            'title': get_element_by_attribute('class', 'ardplayer-title', webpage),
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
        }
        info.update(page_metadata)
        return info
100644 index 0000000..ae3dd13 --- /dev/null +++ b/youtube_dlc/extractor/stanfordoc.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + orderedSet, + unescapeHTML, +) + + +class StanfordOpenClassroomIE(InfoExtractor): + IE_NAME = 'stanfordoc' + IE_DESC = 'Stanford Open ClassRoom' + _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + _TEST = { + 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', + 'md5': '544a9468546059d4e80d76265b0443b8', + 'info_dict': { + 'id': 'PracticalUnix_intro-environment', + 'ext': 'mp4', + 'title': 'Intro Environment', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + if mobj.group('course') and mobj.group('video'): # A specific video + course = mobj.group('course') + video = mobj.group('video') + info = { + 'id': course + '_' + video, + 'uploader': None, + 'upload_date': None, + } + + baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' + xmlUrl = baseUrl + video + '.xml' + mdoc = self._download_xml(xmlUrl, info['id']) + try: + info['title'] = mdoc.findall('./title')[0].text + info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text + except IndexError: + raise ExtractorError('Invalid metadata XML file') + return info + elif mobj.group('course'): # A course page + course = mobj.group('course') + info = { + 'id': course, + '_type': 'playlist', + 'uploader': None, + 'upload_date': None, + } + + coursepage = self._download_webpage( + url, info['id'], + note='Downloading course info page', + errnote='Unable to download course info page') + + info['title'] = self._html_search_regex( + r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) + + info['description'] = 
self._html_search_regex( + r'(?s)<description>([^<]+)</description>', + coursepage, 'description', fatal=False) + + links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info + else: # Root page + info = { + 'id': 'Stanford OpenClassroom', + '_type': 'playlist', + 'uploader': None, + 'upload_date': None, + } + info['title'] = info['id'] + + rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' + rootpage = self._download_webpage(rootURL, info['id'], + errnote='Unable to download course info page') + + links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) + info['entries'] = [self.url_result( + 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) + ) for l in links] + return info diff --git a/youtube_dlc/extractor/steam.py b/youtube_dlc/extractor/steam.py new file mode 100644 index 0000000..a6a191c --- /dev/null +++ b/youtube_dlc/extractor/steam.py @@ -0,0 +1,149 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + ExtractorError, + get_element_by_class, + js_to_json, +) + + +class SteamIE(InfoExtractor): + _VALID_URL = r"""(?x) + https?://store\.steampowered\.com/ + (agecheck/)? + (?P<urltype>video|app)/ #If the page is only for videos or for a game + (?P<gameID>\d+)/? + (?P<videoID>\d*)(?P<extra>\??) 
def _real_extract(self, url):
    """Extract the videos of a Steam store page or a workshop item.

    Workshop items (URLs with a fileID) yield url-type entries that hand
    the embedded YouTube ids to the Youtube extractor. Store pages yield
    one entry per 'movie_<id>' flash var, with formats taken from the flash
    vars and from the matching highlight <div>'s data attributes.

    Raises ExtractorError when no entry could be built.
    """
    match = re.match(self._VALID_URL, url)
    file_id = match.group('fileID')
    if file_id:
        page_url, playlist_id = url, file_id
    else:
        playlist_id = match.group('gameID')
        page_url = self._VIDEO_PAGE_TEMPLATE % playlist_id

    self._set_cookie('steampowered.com', 'mature_content', '1')

    webpage = self._download_webpage(page_url, playlist_id)

    if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
        # Age gate: retry through the age-check URL.
        page_url = self._AGECHECK_TEMPLATE % playlist_id
        self.report_age_confirmation()
        webpage = self._download_webpage(page_url, playlist_id)

    flash_vars = self._parse_json(self._search_regex(
        r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage,
        'flash vars'), playlist_id, js_to_json)

    entries = []
    if file_id:
        playlist_title = get_element_by_class('workshopItemTitle', webpage)
        youtube_ids = (
            movie.get('YOUTUBE_VIDEO_ID')
            for movie in flash_vars.values() if movie)
        entries.extend({
            '_type': 'url',
            'url': youtube_id,
            'ie_key': 'Youtube',
        } for youtube_id in youtube_ids if youtube_id)
    else:
        playlist_title = get_element_by_class('apphub_AppName', webpage)
        for movie_key, movie in flash_vars.items():
            if not movie:
                continue
            video_id = self._search_regex(r'movie_(\d+)', movie_key, 'video id', fatal=False)
            title = movie.get('MOVIE_NAME')
            if not (title and video_id):
                continue

            entry = {
                'id': video_id,
                'title': title.replace('+', ' '),
            }
            formats = []
            flv_url = movie.get('FILENAME')
            if flv_url:
                formats.append({
                    'format_id': 'flv',
                    'url': flv_url,
                })

            highlight_tag = self._search_regex(
                r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id,
                webpage, 'highlight element', fatal=False)
            attribs = extract_attributes(highlight_tag) if highlight_tag else None
            if attribs:
                entry['thumbnail'] = attribs.get('data-poster')
                for quality in ('', '-hd'):
                    for ext in ('webm', 'mp4'):
                        source = attribs.get('data-%s%s-source' % (ext, quality))
                        if source:
                            formats.append({
                                'format_id': ext + quality,
                                'url': source,
                            })

            if formats:
                entry['formats'] = formats
                entries.append(entry)

    if not entries:
        raise ExtractorError('Could not find any videos')

    return self.playlist_result(entries, playlist_id, playlist_title)
def _real_extract(self, url):
    """Extract a Stitcher podcast episode.

    The episode data is read from the 'stitcher'/'stitcherConfig'
    JavaScript object embedded in the page; the description is scraped
    from the page markup. The display id falls back to the numeric audio
    id when the URL carries no slug.
    """
    match = re.match(self._VALID_URL, url)
    audio_id = match.group('id')
    display_id = match.group('display_id') or audio_id

    webpage = self._download_webpage(url, display_id)

    config_js = self._search_regex(
        r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')
    episode = self._parse_json(
        js_to_json(config_js), display_id)['config']['episode']

    title = unescapeHTML(episode['title'])

    formats = []
    episode_url = episode.get('episodeURL')
    if episode_url:
        formats.append({
            'url': episode_url,
            'ext': determine_ext(episode_url) or 'mp3',
            'vcodec': 'none',  # audio only
        })

    description = self._search_regex(
        r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)

    return {
        'id': audio_id,
        'display_id': display_id,
        'title': title,
        'description': description,
        'duration': int_or_none(episode.get('duration')),
        'thumbnail': episode.get('episodeImage'),
        'formats': formats,
    }
+ }, { + 'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID + 'md5': '7a2dc6d60c4889edfed459c620fe690d', + 'info_dict': { + 'id': '5f1e11ecd78a57b6c702001d', + 'ext': 'm4a', + 'title': 'Weird Nintendo Prototype Leaks', + 'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis', + 'timestamp': 1595808576, + 'upload_date': '20200727', + 'uploader': 'whang!', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + }, + 'params': {'format': 'bestaudio'} # Verifying audio extraction + + }] + + _aformats = { + 'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10}, + 'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1}, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Extracting the json blob is mandatory to proceed with extraction. + jsontext = self._html_search_regex( + r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', + webpage, 'json_data') + + json = self._parse_json(jsontext, video_id) + + # The currentVideo field in the json is mandatory + # because it contains the only link to the m3u playlist + video = json['props']['initialState']['video']['currentVideo'] + videourl = video['vimeoVideoURL'] # Video URL is mandatory + + # Extract other fields from the json in an error tolerant fashion + # ID may be incorrect (on short URL format), correct it. 
+ parsed_id = video.get('_id') + if parsed_id: + video_id = parsed_id + + title = video.get('title') + description = video.get('description') + + thumbnail = video.get('storyImage') + views = video.get('views') + likes = video.get('likesCount') + comments = video.get('commentsCount') + duration = video.get('videoDuration') + publishdate = video.get('publishDate') # Apparently epoch time, day only + + uploader = video.get('username') + uploader_id = video.get('hostID') + # Construct an uploader URL + uploader_url = None + if uploader_id: + uploader_url = "https://storyfire.com/user/%s/video" % uploader_id + + # Collect root playlist to determine formats + formats = self._extract_m3u8_formats( + videourl, video_id, 'mp4', 'm3u8_native') + + # Modify formats to fill in missing information about audio codecs + for format in formats: + aformat = self._aformats.get(format['format_id']) + if aformat: + format['acodec'] = aformat['acodec'] + format['abr'] = aformat['abr'] + format['preference'] = aformat['preference'] + format['ext'] = 'm4a' + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'ext': "mp4", + 'url': videourl, + 'formats': formats, + + 'thumbnail': thumbnail, + 'view_count': views, + 'like_count': likes, + 'comment_count': comments, + 'duration': duration, + 'timestamp': publishdate, + + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + + } + + +class StoryFireUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video' + _TESTS = [{ + 'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video', + 'info_dict': { + 'id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'title': 'whang!', + }, + 'playlist_mincount': 18 + }, { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + 'title': 'McJuggerNuggets', + }, + 'playlist_mincount': 143 + + }] + + # 
Generator for fetching playlist items + def _enum_videos(self, baseurl, user_id, firstjson): + totalVideos = int(firstjson['videosCount']) + haveVideos = 0 + json = firstjson + + for page in itertools.count(1): + for video in json['videos']: + id = video['_id'] + url = "https://storyfire.com/video-details/%s" % id + haveVideos += 1 + yield { + '_type': 'url', + 'id': id, + 'url': url, + 'ie_key': 'StoryFire', + + 'title': video.get('title'), + 'description': video.get('description'), + 'view_count': video.get('views'), + 'comment_count': video.get('commentsCount'), + 'duration': video.get('videoDuration'), + 'timestamp': video.get('publishDate'), + } + # Are there more pages we could fetch? + if haveVideos < totalVideos: + pageurl = baseurl + ("%i" % haveVideos) + json = self._download_json(pageurl, user_id, + note='Downloading page %s' % page) + + # Are there any videos in the new json? + videos = json.get('videos') + if not videos or len(videos) == 0: + break # no videos + + else: + break # We have fetched all the videos, stop + + def _real_extract(self, url): + user_id = self._match_id(url) + + baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id + + # Download first page to ensure it can be downloaded, and get user information if available. 
+ firstpage = baseurl + "0" + firstjson = self._download_json(firstpage, user_id) + + title = None + videos = firstjson.get('videos') + if videos and len(videos): + title = videos[1].get('username') + + return { + '_type': 'playlist', + 'entries': self._enum_videos(baseurl, user_id, firstjson), + 'id': user_id, + 'title': title, + } + + +class StoryFireSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13 + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0 # This playlist has entries, but no videos. + }, { + 'url': 'https://storyfire.com/write/series/stories/story_time', + 'info_dict': { + 'id': 'story_time', + }, + 'playlist_mincount': 10 + }] + + # Generator for returning playlist items + # This object is substantially different than the one in the user videos page above + def _enum_videos(self, jsonlist): + for video in jsonlist: + id = video['_id'] + if video.get('hasVideo'): # Boolean element + url = "https://storyfire.com/video-details/%s" % id + yield { + '_type': 'url', + 'id': id, + 'url': url, + 'ie_key': 'StoryFire', + + 'title': video.get('title'), + 'description': video.get('description'), + 'view_count': video.get('views'), + 'likes_count': video.get('likesCount'), + 'comment_count': video.get('commentsCount'), + 'duration': video.get('videoDuration'), + 'timestamp': video.get('publishDate'), + } + + def _real_extract(self, url): + list_id = self._match_id(url) + + listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id + json = self._download_json(listurl, list_id) + + return { + '_type': 'playlist', + 'entries': self._enum_videos(json), + 'id': list_id + } diff --git a/youtube_dlc/extractor/streamable.py 
class StreamableIE(InfoExtractor):
    """Extractor for videos hosted on streamable.com.

    Metadata and file URLs are read from the ``ajax.streamable.com`` JSON
    endpoint rather than from the watch page.
    """

    _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'
    _TESTS = [
        {
            'url': 'https://streamable.com/dnd1',
            'md5': '3e3bc5ca088b48c2d436529b64397fef',
            'info_dict': {
                'id': 'dnd1',
                'ext': 'mp4',
                'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol',
                'thumbnail': r're:https?://.*\.jpg$',
                'uploader': 'teabaker',
                'timestamp': 1454964157.35115,
                'upload_date': '20160208',
                'duration': 61.516,
                'view_count': int,
            }
        },
        # older video without bitrate, width/height, etc. info
        {
            'url': 'https://streamable.com/moo',
            'md5': '2cf6923639b87fba3279ad0df3a64e73',
            'info_dict': {
                'id': 'moo',
                'ext': 'mp4',
                'title': '"Please don\'t eat me!"',
                'thumbnail': r're:https?://.*\.jpg$',
                'timestamp': 1426115495,
                'upload_date': '20150311',
                'duration': 12,
                'view_count': int,
            }
        },
        {
            'url': 'https://streamable.com/e/dnd1',
            'only_matching': True,
        },
        {
            'url': 'https://streamable.com/s/okkqk/drxjds',
            'only_matching': True,
        }
    ]

    @staticmethod
    def _extract_url(webpage):
        """Return the ``src`` of the first Streamable iframe in *webpage*.

        :param webpage: HTML text to search.
        :returns: the embed URL (possibly protocol-relative), or ``None``
            when no Streamable iframe is present.
        """
        # The lookahead has to guard every consumed character. The previous
        # form ``(?:(?!\1).+)`` tested it once and then matched greedily, so
        # the captured URL ran on to the last matching quote of the line.
        mobj = re.search(
            r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).)+)(?P=q1)',
            webpage)
        if mobj:
            return mobj.group('src')

    def _real_extract(self, url):
        """Return the info dict for one Streamable video.

        :raises ExtractorError: (expected) when the reported status is not
            ``2``, i.e. no file is ready yet or the video is unavailable.
        """
        video_id = self._match_id(url)

        # Note: Using the ajax API, as the public Streamable API doesn't seem
        # to return video info like the title properly sometimes, and doesn't
        # include info like the video duration
        video = self._download_json(
            'https://ajax.streamable.com/videos/%s' % video_id, video_id)

        # Status codes:
        # 0 The video is being uploaded
        # 1 The video is being processed
        # 2 The video has at least one file ready
        # 3 The video is unavailable due to an error
        status = video.get('status')
        if status != 2:
            raise ExtractorError(
                'This video is currently unavailable. It may still be uploading or processing.',
                expected=True)

        title = video.get('reddit_title') or video['title']

        formats = []
        for key, info in video['files'].items():
            # Entries without a URL cannot be downloaded; skip them.
            if not info.get('url'):
                continue
            formats.append({
                'format_id': key,
                'url': self._proto_relative_url(info['url']),
                'width': int_or_none(info.get('width')),
                'height': int_or_none(info.get('height')),
                'filesize': int_or_none(info.get('size')),
                'fps': int_or_none(info.get('framerate')),
                'vbr': float_or_none(info.get('bitrate'), 1000)
            })
        self._sort_formats(formats)

        # ``owner`` may be present but null; ``.get('owner', {})`` would then
        # return None and fail on the chained ``.get``.
        owner = video.get('owner') or {}

        return {
            'id': video_id,
            'title': title,
            'description': video.get('description'),
            'thumbnail': self._proto_relative_url(video.get('thumbnail_url')),
            'uploader': owner.get('user_name'),
            'timestamp': float_or_none(video.get('date_added')),
            'duration': float_or_none(video.get('duration')),
            'view_count': int_or_none(video.get('plays')),
            'formats': formats
        }
+ + _TESTS = [{ + 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dlc_test_video_____________-BaW_jenozKc.mp4.html', + 'md5': '6bea4c7fa5daaacc2a946b7146286686', + 'info_dict': { + 'id': 'skp9j99s4bpz', + 'ext': 'mp4', + 'title': 'youtube-dlc test video \'/\\ ä ↭', + }, + 'skip': 'Only available from the EU' + }, { + 'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url = 'http://streamcloud.eu/%s' % video_id + + orig_webpage = self._download_webpage(url, video_id) + + if '>File Not Found<' in orig_webpage: + raise ExtractorError( + 'Video %s does not exist' % video_id, expected=True) + + fields = re.findall(r'''(?x)<input\s+ + type="(?:hidden|submit)"\s+ + name="([^"]+)"\s+ + (?:id="[^"]+"\s+)? + value="([^"]*)" + ''', orig_webpage) + + self._sleep(6, video_id) + + webpage = self._download_webpage( + url, video_id, data=urlencode_postdata(fields), headers={ + b'Content-Type': b'application/x-www-form-urlencoded', + }) + + try: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)<', webpage, 'title') + video_url = self._search_regex( + r'file:\s*"([^"]+)"', webpage, 'video URL') + except ExtractorError: + message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>', + webpage, 'message', default=None, group='message') + if message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + raise + thumbnail = self._search_regex( + r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'http_headers': { + 'Referer': url, + }, + } diff --git a/youtube_dlc/extractor/streamcz.py b/youtube_dlc/extractor/streamcz.py new file mode 100644 index 0000000..58e0b4c --- /dev/null +++ b/youtube_dlc/extractor/streamcz.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ 
def _get_api_key(api_path):
    """Compute the value sent as the ``Api-Password`` header for *api_path*.

    The result is the hex MD5 digest of a fixed secret, the API path with a
    single trailing ``?`` removed, and the current Unix time expressed in
    days and rounded to the nearest whole day.

    :param api_path: API path such as ``/episode/<id>``; it must be ASCII
        because the payload is encoded with the ``ascii`` codec.
    :returns: 32-character lowercase hexadecimal digest.
    """
    secret = 'fb5f58a820353bd7095de526253c14fd'
    path = api_path[:-1] if api_path.endswith('?') else api_path
    # The key changes once per day: the time is bucketed into whole days.
    day_number = int(round(time.time() / 24 / 3600))
    payload = '{0:}{1:}{2:}'.format(secret, path, day_number)
    return hashlib.md5(payload.encode('ascii')).hexdigest()
class StreetVoiceIE(InfoExtractor):
    """Extractor for single songs on streetvoice.com (any subdomain)."""

    _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://streetvoice.com/skippylu/songs/94440/',
        'md5': '15974627fc01a29e492c98593c2fd472',
        'info_dict': {
            'id': '94440',
            'ext': 'mp3',
            'title': '輸',
            'description': 'Crispy脆樂團 - 輸',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 260,
            'upload_date': '20091018',
            'uploader': 'Crispy脆樂團',
            'uploader_id': '627810',
        }
    }, {
        'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the song addressed by *url*.

        The song record is requested from the public song API with an empty
        request body (``data=b''``).
        """
        song_id = self._match_id(url)
        api_url = 'https://streetvoice.com/api/v1/public/song/%s/' % song_id
        song = self._download_json(api_url, song_id, data=b'')

        title = song['name']
        uploader = song['user']
        author = uploader['nickname']

        info = {
            'id': song_id,
            'url': song['file'],
            'title': title,
            'description': '%s - %s' % (author, title),
        }
        info['thumbnail'] = self._proto_relative_url(song.get('image'), 'http:')
        info['duration'] = song.get('length')
        info['upload_date'] = unified_strdate(song.get('created_at'))
        info['uploader'] = author
        info['uploader_id'] = compat_str(uploader['id'])
        return info
class STVPlayerIE(InfoExtractor):
    """Extractor for player.stv.tv episode and short-form video pages.

    It looks up the Brightcove video id in the STV player API and returns a
    ``url_transparent`` result pointing at the ``BrightcoveNew`` extractor.
    """

    IE_NAME = 'stv:player'
    _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})'
    _TEST = {
        'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/',
        'md5': '5adf9439c31d554f8be0707c7abe7e0a',
        'info_dict': {
            'id': '5333973339001',
            'ext': 'mp4',
            'upload_date': '20170301',
            'title': '60 seconds on set with Laura Norton',
            'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? Let\'s find out!",
            'timestamp': 1488388054,
            'uploader_id': '1486976045',
        },
        'skip': 'this resource is unavailable outside of the UK',
    }
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s'
    # Maps the URL path segment to the API collection name.
    _PTYPE_MAP = {
        'episode': 'episodes',
        'video': 'shortform',
    }

    def _real_extract(self, url):
        """Return a ``url_transparent`` result for the Brightcove video."""
        mobj = re.match(self._VALID_URL, url)
        ptype, video_id = mobj.groups()

        api_url = 'https://player.api.stv.tv/v1/%s/%s' % (
            self._PTYPE_MAP[ptype], video_id)
        result = self._download_json(api_url, video_id)['results']
        video = result['video']
        brightcove_id = compat_str(video['id'])

        # Every subtitle track is filed under 'en'; the key is only created
        # when at least one track exists.
        tracks = [{
            'ext': 'vtt' if ext == 'webvtt' else ext,
            'url': sub_url,
        } for ext, sub_url in (result.get('_subtitles') or {}).items()]
        subtitles = {'en': tracks} if tracks else {}

        programme = result.get('programme') or {}

        return {
            '_type': 'url_transparent',
            'id': brightcove_id,
            'url': self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
            'description': result.get('summary'),
            'duration': float_or_none(video.get('length'), 1000),
            'subtitles': subtitles,
            'view_count': int_or_none(result.get('views')),
            'series': programme.get('name') or programme.get('shortName'),
            'ie_key': 'BrightcoveNew',
        }
'md5:0a400058e8105d39e35c35e7c5184164', + 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 302, + 'age_limit': 18, + } + }, { + 'url': 'http://embeds.sunporno.com/embed/807778', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.sunporno.com/videos/%s' % video_id, video_id) + + title = self._html_search_regex( + r'<title>([^<]+)', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') + thumbnail = self._html_search_regex( + r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) + + duration = parse_duration(self._search_regex( + (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', + r'>Duration:\s*]+>\s*(\d+:\d+)\s*<'), + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._html_search_regex( + r'class="views">(?: