[condenast] fix extraction and extract subtitles

2024-01-07 17:16:08 +00:00 · 2020-11-16 18:57:33 +01:00 · 2020-11-16 18:57:33 +01:00 · 9448a20312
parent 3f1748b944
commit 9448a20312
1 changed file with 23 additions and 4 deletions
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -16,6 +16,8 @@ from ..utils import (
    mimetype2ext,
    orderedSet,
    parse_iso8601,
+    strip_or_none,
+    try_get,
 )


@@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
            'uploader': 'gq',
            'upload_date': '20170321',
            'timestamp': 1490126427,
+            'description': 'How much grimmer would things be if these people were competent?',
        },
    }, {
        # JS embed
@@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
            'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
            'uploader': 'arstechnica',
            'upload_date': '20150916',
-            'timestamp': 1442434955,
+            'timestamp': 1442434920,
        }
    }, {
        'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor):
            })
        self._sort_formats(formats)

+        subtitles = {}
+        for t, caption in video_info.get('captions', {}).items():
+            caption_url = caption.get('src')
+            if not (t in ('vtt', 'srt', 'tml') and caption_url):
+                continue
+            subtitles.setdefault('en', []).append({'url': caption_url})
+
        return {
            'id': video_id,
            'formats': formats,
@@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor):
            'season': video_info.get('season_title'),
            'timestamp': parse_iso8601(video_info.get('premiere_date')),
            'categories': video_info.get('categories'),
+            'subtitles': subtitles,
        }

    def _real_extract(self, url):
@@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor):
        if url_type == 'series':
            return self._extract_series(url, webpage)
        else:
-            params = self._extract_video_params(webpage, display_id)
-            info = self._search_json_ld(
-                webpage, display_id, fatal=False)
+            video = try_get(self._parse_json(self._search_regex(
+                r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+                'preload state', '{}'), display_id),
+                lambda x: x['transformed']['video'])
+            if video:
+                params = {'videoId': video['id']}
+                info = {'description': strip_or_none(video.get('description'))}
+            else:
+                params = self._extract_video_params(webpage, display_id)
+                info = self._search_json_ld(
+                    webpage, display_id, fatal=False)
            info.update(self._extract_video(params))
            return info