[khanacademy] fix extraction (closes #2887) (closes #26803)

This commit is contained in:
Remita Amine 2021-01-08 16:13:22 +01:00
parent 2c337f4e85
commit 61e669acff
2 changed files with 85 additions and 57 deletions

View file

@@ -526,7 +526,10 @@ from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE from .keezmovies import KeezMoviesIE
from .ketnet import KetnetIE from .ketnet import KetnetIE
from .khanacademy import KhanAcademyIE from .khanacademy import (
KhanAcademyIE,
KhanAcademyUnitIE,
)
from .kickstarter import KickStarterIE from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE from .kinopoisk import KinoPoiskIE

View file

@@ -1,82 +1,107 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
unified_strdate, int_or_none,
parse_iso8601,
try_get,
) )
class KhanAcademyIE(InfoExtractor): class KhanAcademyBaseIE(InfoExtractor):
_VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])' _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
IE_NAME = 'KhanAcademy'
_TESTS = [{ def _parse_video(self, video):
'url': 'http://www.khanacademy.org/video/one-time-pad', return {
'md5': '7b391cce85e758fb94f763ddc1bbb979', '_type': 'url_transparent',
'url': video['youtubeId'],
'id': video.get('slug'),
'title': video.get('title'),
'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
'duration': int_or_none(video.get('duration')),
'description': video.get('description'),
'ie_key': 'Youtube',
}
def _real_extract(self, url):
display_id = self._match_id(url)
component_props = self._parse_json(self._download_json(
'https://www.khanacademy.org/api/internal/graphql',
display_id, query={
'hash': 1604303425,
'variables': json.dumps({
'path': display_id,
'queryParams': '',
}),
})['data']['contentJson'], display_id)['componentProps']
return self._parse_component_props(component_props)
class KhanAcademyIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy'
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
'info_dict': { 'info_dict': {
'id': 'one-time-pad', 'id': 'FlIG3TvQCBQ',
'ext': 'webm', 'ext': 'mp4',
'title': 'The one-time pad', 'title': 'The one-time pad',
'description': 'The perfect cipher', 'description': 'The perfect cipher',
'duration': 176, 'duration': 176,
'uploader': 'Brit Cruise', 'uploader': 'Brit Cruise',
'uploader_id': 'khanacademy', 'uploader_id': 'khanacademy',
'upload_date': '20120411', 'upload_date': '20120411',
'timestamp': 1334170113,
'license': 'cc-by-nc-sa',
}, },
'add_ie': ['Youtube'], 'add_ie': ['Youtube'],
}, { }
'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
def _parse_component_props(self, component_props):
video = component_props['tutorialPageData']['contentModel']
info = self._parse_video(video)
author_names = video.get('authorNames')
info.update({
'uploader': ', '.join(author_names) if author_names else None,
'timestamp': parse_iso8601(video.get('dateAdded')),
'license': video.get('kaUserLicense'),
})
return info
class KhanAcademyUnitIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy:unit'
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
'info_dict': { 'info_dict': {
'id': 'cryptography', 'id': 'cryptography',
'title': 'Journey into cryptography', 'title': 'Cryptography',
'description': 'How have humans protected their secret messages through history? What has changed today?', 'description': 'How have humans protected their secret messages through history? What has changed today?',
}, },
'playlist_mincount': 3, 'playlist_mincount': 31,
}] }
def _real_extract(self, url): def _parse_component_props(self, component_props):
m = re.match(self._VALID_URL, url) curation = component_props['curation']
video_id = m.group('id')
if m.group('key') == 'video': entries = []
data = self._download_json( tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
'http://api.khanacademy.org/api/v1/videos/' + video_id, for tutorial_number, tutorial in enumerate(tutorials, 1):
video_id, 'Downloading video info') chapter_info = {
'chapter': tutorial.get('title'),
upload_date = unified_strdate(data['date_added']) 'chapter_number': tutorial_number,
uploader = ', '.join(data['author_names']) 'chapter_id': tutorial.get('id'),
return {
'_type': 'url_transparent',
'url': data['url'],
'id': video_id,
'title': data['title'],
'thumbnail': data['image_url'],
'duration': data['duration'],
'description': data['description'],
'uploader': uploader,
'upload_date': upload_date,
} }
else: for content_item in (tutorial.get('contentItems') or []):
# topic if content_item.get('kind') == 'Video':
data = self._download_json( info = self._parse_video(content_item)
'http://api.khanacademy.org/api/v1/topic/' + video_id, info.update(chapter_info)
video_id, 'Downloading topic info') entries.append(info)
entries = [ return self.playlist_result(
{ entries, curation.get('unit'), curation.get('title'),
'_type': 'url', curation.get('description'))
'url': c['url'],
'id': c['id'],
'title': c['title'],
}
for c in data['children'] if c['kind'] in ('Video', 'Topic')]
return {
'_type': 'playlist',
'id': video_id,
'title': data['title'],
'description': data['description'],
'entries': entries,
}