From e4d6cca0c1bb987592b576bd3bd439e6ecc9b342 Mon Sep 17 00:00:00 2001 From: net Date: Wed, 1 Oct 2014 23:45:35 +0300 Subject: [PATCH 1/3] [walla] Add new extractor --- youtube_dl/extractor/walla.py | 70 +++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/walla.py diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py new file mode 100644 index 000000000..e687c3af0 --- /dev/null +++ b/youtube_dl/extractor/walla.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +import re + +from .common import InfoExtractor + + +class WallaIE(InfoExtractor): + _VALID_URL = r'http://vod\.walla\.co\.il/\w+/(?P\d+)' + _TEST = { + 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', + 'info_dict': { + 'id': '2642630', + 'ext': 'flv', + 'title': 'וואן דיירקשן: ההיסטריה', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + + config_url = 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id + + webpage = self._download_webpage(config_url, video_id, '') + + media_id = self._html_search_regex(r'(\d+)', webpage, video_id, 'extract media id') + + prefix = '0' if len(media_id) == 7 else '' + + series = '%s%s' % (prefix, media_id[0:2]) + session = media_id[2:5] + episode = media_id[5:7] + + title = self._html_search_regex(r'(.*)', webpage, video_id, 'title') + + default_quality = self._html_search_regex(r'', webpage, video_id, 0) + + quality = default_quality if default_quality else '40' + + media_path = '/%s/%s/%s' % (series, session, media_id) #self._html_search_regex(r'.*(.*)' % default_quality ,webpage, '', flags=re.DOTALL) + + playpath = 'mp4:media/%s/%s/%s-%s' % (series, session, media_id, quality) #self._html_search_regex(r'.*(.*)' % default_quality ,webpage, '', flags=re.DOTALL) + + subtitles = {} + + subtitle_url = self._html_search_regex(r'(.*).*', webpage, video_id, 0) + + 
print subtitle_url + + if subtitle_url: + subtitles_page = self._download_webpage(subtitle_url, video_id, '') + subtitles['heb'] = subtitles_page + + return { + 'id': video_id, + 'title': title, + 'url': 'rtmp://wafla.walla.co.il:1935/vod', + 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', + 'page_url': url, + 'app': "vod", + 'play_path': playpath, + 'tc_url': 'rtmp://wafla.walla.co.il:1935/vod', + 'rtmp_protocol': 'rtmp', + 'ext': 'flv', + 'subtitles': subtitles, + } \ No newline at end of file From 31d06400ecee79ecf3a0bd38c0702f165fd6d958 Mon Sep 17 00:00:00 2001 From: net Date: Mon, 6 Oct 2014 03:03:05 +0300 Subject: [PATCH 2/3] add missed init file --- youtube_dl/extractor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 079221567..b07c0b4cc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -427,6 +427,7 @@ from .vporn import VpornIE from .vube import VubeIE from .vuclip import VuClipIE from .vulture import VultureIE +from .walla import WallaIE from .washingtonpost import WashingtonPostIE from .wat import WatIE from .wayofthemaster import WayOfTheMasterIE From 7bc8780c576505fd87a5c85ff1f50ef2e8841d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Oct 2014 22:23:05 +0700 Subject: [PATCH 3/3] [walla] Fix extractor and add subtitle tests --- test/test_subtitles.py | 28 +++++++++++ youtube_dl/extractor/walla.py | 95 +++++++++++++++++++++-------------- 2 files changed, 85 insertions(+), 38 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 48c302198..eb5f2f8dd 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -15,6 +15,7 @@ from youtube_dl.extractor import ( DailymotionIE, TEDIE, VimeoIE, + WallaIE, ) @@ -279,5 +280,32 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for 
\'%s\' not extracted' % lang) +class TestWallsSubtitles(BaseTestSubtitles): + url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' + IE = WallaIE + + def test_list_subtitles(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() + self.assertEqual(info_dict, None) + + def test_allsubtitles(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['heb'])) + self.assertEqual(md5(subtitles['heb']), 'e758c5d7cb982f6bef14f377ec7a3920') + + def test_nosubtitles(self): + self.DL.expect_warning(u'video doesn\'t have subtitles') + self.url = 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index e687c3af0..672bda7a7 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -1,70 +1,89 @@ # coding: utf-8 from __future__ import unicode_literals - import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor +from ..utils import ( + xpath_text, + int_or_none, +) -class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/\w+/(?P\d+)' +class WallaIE(SubtitlesInfoExtractor): + _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { 'id': '2642630', + 'display_id': 'one-direction-all-for-one', 'ext': 'flv', 'title': 'וואן דיירקשן: ההיסטריה', + 'description': 'md5:de9e2512a92442574cdb0913c49bc4d8', + 'thumbnail': 
're:^https?://.*\.jpg', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, } } + _SUBTITLE_LANGS = { + 'עברית': 'heb', + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + display_id = mobj.group('display_id') - config_url = 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id - - webpage = self._download_webpage(config_url, video_id, '') + video = self._download_xml( + 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id, + display_id) - media_id = self._html_search_regex(r'(\d+)', webpage, video_id, 'extract media id') + item = video.find('./items/item') - prefix = '0' if len(media_id) == 7 else '' - - series = '%s%s' % (prefix, media_id[0:2]) - session = media_id[2:5] - episode = media_id[5:7] - - title = self._html_search_regex(r'(.*)', webpage, video_id, 'title') - - default_quality = self._html_search_regex(r'', webpage, video_id, 0) - - quality = default_quality if default_quality else '40' - - media_path = '/%s/%s/%s' % (series, session, media_id) #self._html_search_regex(r'.*(.*)' % default_quality ,webpage, '', flags=re.DOTALL) - - playpath = 'mp4:media/%s/%s/%s-%s' % (series, session, media_id, quality) #self._html_search_regex(r'.*(.*)' % default_quality ,webpage, '', flags=re.DOTALL) + title = xpath_text(item, './title', 'title') + description = xpath_text(item, './synopsis', 'description') + thumbnail = xpath_text(item, './preview_pic', 'thumbnail') + duration = int_or_none(xpath_text(item, './duration', 'duration')) subtitles = {} + for subtitle in item.findall('./subtitles/subtitle'): + lang = xpath_text(subtitle, './title') + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src') - subtitle_url = self._html_search_regex(r'(.*).*', webpage, video_id, 0) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return - print subtitle_url + subtitles = 
self.extract_subtitles(video_id, subtitles) - if subtitle_url: - subtitles_page = self._download_webpage(subtitle_url, video_id, '') - subtitles['heb'] = subtitles_page + formats = [] + for quality in item.findall('./qualities/quality'): + format_id = xpath_text(quality, './title') + fmt = { + 'url': 'rtmp://wafla.walla.co.il/vod', + 'play_path': xpath_text(quality, './src'), + 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', + 'page_url': url, + 'ext': 'flv', + 'format_id': xpath_text(quality, './title'), + } + m = re.search(r'^(?P<height>\d+)[Pp]', format_id) + if m: + fmt['height'] = int(m.group('height')) + formats.append(fmt) + self._sort_formats(formats) return { 'id': video_id, + 'display_id': display_id, 'title': title, - 'url': 'rtmp://wafla.walla.co.il:1935/vod', - 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', - 'page_url': url, - 'app': "vod", - 'play_path': playpath, - 'tc_url': 'rtmp://wafla.walla.co.il:1935/vod', - 'rtmp_protocol': 'rtmp', - 'ext': 'flv', + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, 'subtitles': subtitles, - } \ No newline at end of file + }