From 6c370bc149945e26c6e359147221b3aa96e0ea75 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Thu, 10 Dec 2020 14:52:35 +0100
Subject: [PATCH] [facebook] Add support for Relay based pages(closes #26823)

---
 youtube_dl/extractor/facebook.py | 71 +++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index c23ff298e..f1a90362f 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -16,11 +16,13 @@ from ..utils import (
     clean_html,
     error_to_compat_str,
     ExtractorError,
+    float_or_none,
     get_element_by_id,
     int_or_none,
     js_to_json,
     limit_length,
     parse_count,
+    qualities,
     sanitized_Request,
     try_get,
     urlencode_postdata,
@@ -327,6 +329,14 @@ class FacebookIE(InfoExtractor):
             return extract_video_data(try_get(
                 js_data, lambda x: x['jsmods']['instances'], list) or [])
 
+        formats = []
+
+        def extract_dash_manifest(video):
+            dash_manifest = video.get('dash_manifest')
+            if dash_manifest:
+                formats.extend(self._parse_mpd_formats(
+                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+
         if not video_data:
             server_js_data = self._parse_json(self._search_regex([
                 r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
@@ -334,6 +344,61 @@ class FacebookIE(InfoExtractor):
             ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
             video_data = extract_from_jsmods_instances(server_js_data)
 
+        if not video_data:
+            graphql_data = self._parse_json(self._search_regex(
+                r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
+                webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
+            for require in (graphql_data.get('require') or []):
+                if require[0] == 'RelayPrefetchedStreamCache':
+                    def parse_graphql_video(video):
+                        q = qualities(['sd', 'hd'])
+                        for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
+                            playable_url = video.get('playable_url' + suffix)
+                            if not playable_url:
+                                continue
+                            formats.append({
+                                'format_id': format_id,
+                                'quality': q(format_id),
+                                'url': playable_url,
+                            })
+                        extract_dash_manifest(video)
+                        self._sort_formats(formats)
+                        v_id = video.get('videoId') or video.get('id') or video_id
+                        info = {
+                            'id': v_id,
+                            'formats': formats,
+                            'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+                            'uploader_id': try_get(video, lambda x: x['owner']['id']),
+                            'timestamp': int_or_none(video.get('publish_time')),
+                            'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
+                        }
+                        description = try_get(video, lambda x: x['savable_description']['text'])
+                        title = video.get('name')
+                        if title:
+                            info.update({
+                                'title': title,
+                                'description': description,
+                            })
+                        else:
+                            info['title'] = description or 'Facebook video #%s' % v_id
+                        return webpage, info
+
+                    data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+
+                    attachments = try_get(data, [
+                        lambda x: x['video']['story']['attachments'],
+                        lambda x: x['video']['creation_story']['attachments'],
+                        lambda x: x['node']['comet_sections']['content']['story']['attachments']
+                    ], list) or []
+                    for attachment in attachments:
+                        media = attachment.get('media') or try_get(attachment, lambda x: x['style_type_renderer']['attachment']['media'], dict) or {}
+                        if media.get('__typename') == 'Video':
+                            return parse_graphql_video(media)
+
+                    video = data.get('video') or {}
+                    if video:
+                        return parse_graphql_video(video)
+
         if not video_data:
             if not fatal_if_no_video:
                 return webpage, False
@@ -375,7 +440,6 @@ class FacebookIE(InfoExtractor):
                 raise ExtractorError('Cannot parse data')
 
         subtitles = {}
-        formats = []
         for f in video_data:
             format_id = f['stream_type']
             if f and isinstance(f, dict):
@@ -394,10 +458,7 @@ class FacebookIE(InfoExtractor):
                         'url': src,
                         'preference': preference,
                     })
-            dash_manifest = f[0].get('dash_manifest')
-            if dash_manifest:
-                formats.extend(self._parse_mpd_formats(
-                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+            extract_dash_manifest(f[0])
             subtitles_src = f[0].get('subtitles_src')
            if subtitles_src:
                 subtitles.setdefault('en', []).append({'url': subtitles_src})
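
Note (not part of the patch): the new code path keys off the RelayPrefetchedStreamCache
entries that Relay/Comet pages embed via handleWithCustomApplyEach. Below is a minimal
standalone sketch of that traversal; the sample payload only illustrates the shape the
extractor expects (field names taken from the diff above), it is not a captured Facebook
response, and the helper name and sample values are made up.

    # Illustration only: walk a Relay-prefetched GraphQL payload the way the
    # patched extractor does and pick the SD/HD progressive URLs.
    sample_graphql_data = {
        'require': [
            ['RelayPrefetchedStreamCache', 'next', [], ['sample_cache_key', {
                '__bbox': {'result': {'data': {'video': {
                    'id': '1234567890',
                    'playable_url': 'https://example.invalid/video_sd.mp4',
                    'playable_url_quality_hd': 'https://example.invalid/video_hd.mp4',
                    'publish_time': 1607606400,
                    'playable_duration_in_ms': 12345,
                }}}},
            ]],
        ],
    }

    def find_relay_video(graphql_data):
        # Mirrors the hunk above: locate the prefetched Relay cache entry and
        # return its video node (story attachment handling omitted for brevity).
        for require in (graphql_data.get('require') or []):
            if require[0] == 'RelayPrefetchedStreamCache':
                data = require[3][1]['__bbox']['result']['data']
                return data.get('video') or {}
        return {}

    video = find_relay_video(sample_graphql_data)
    for suffix, format_id in [('', 'sd'), ('_quality_hd', 'hd')]:
        url = video.get('playable_url' + suffix)
        if url:
            print(format_id, url)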