[itv] Improve extraction, extract more subtitles and duration (closes #14944)

This commit is contained in:
Sergey M․ 2017-12-14 04:49:07 +07:00
parent 7974e289a1
commit 3fae11ac00
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -26,7 +26,7 @@ from ..utils import (
class ITVIE(InfoExtractor): class ITVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
_GEO_COUNTRIES = ['GB'] _GEO_COUNTRIES = ['GB']
_TEST = { _TESTS = [{
'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
'info_dict': { 'info_dict': {
'id': '2a2936a0053', 'id': '2a2936a0053',
@ -37,7 +37,11 @@ class ITVIE(InfoExtractor):
# rtmp download # rtmp download
'skip_download': True, 'skip_download': True,
}, },
} }, {
# unavailable via data-playlist-url
'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -101,6 +105,18 @@ class ITVIE(InfoExtractor):
'Content-Type': 'text/xml; charset=utf-8', 'Content-Type': 'text/xml; charset=utf-8',
'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',
}) })
info = self._search_json_ld(webpage, video_id, default={})
formats = []
subtitles = {}
def extract_subtitle(sub_url):
ext = determine_ext(sub_url, 'ttml')
subtitles.setdefault('en', []).append({
'url': sub_url,
'ext': 'ttml' if ext == 'xml' else ext,
})
resp_env = self._download_xml( resp_env = self._download_xml(
params['data-playlist-url'], video_id, params['data-playlist-url'], video_id,
headers=headers, data=etree.tostring(req_env)) headers=headers, data=etree.tostring(req_env))
@ -111,13 +127,27 @@ class ITVIE(InfoExtractor):
if fault_code == 'InvalidGeoRegion': if fault_code == 'InvalidGeoRegion':
self.raise_geo_restricted( self.raise_geo_restricted(
msg=fault_string, countries=self._GEO_COUNTRIES) msg=fault_string, countries=self._GEO_COUNTRIES)
raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) elif fault_code != 'InvalidEntity':
title = xpath_text(playlist, 'EpisodeTitle', fatal=True) raise ExtractorError(
'%s said: %s' % (self.IE_NAME, fault_string), expected=True)
info.update({
'title': self._og_search_title(webpage),
'episode_title': params.get('data-video-episode'),
'series': params.get('data-video-title'),
})
else:
title = xpath_text(playlist, 'EpisodeTitle', default=None)
info.update({
'title': title,
'episode_title': title,
'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
'series': xpath_text(playlist, 'ProgrammeTitle'),
'duration': parse_duration(xpath_text(playlist, 'Duration')),
})
video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
media_files = xpath_element(video_element, 'MediaFiles', fatal=True) media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
rtmp_url = media_files.attrib['base'] rtmp_url = media_files.attrib['base']
formats = []
for media_file in media_files.findall('MediaFile'): for media_file in media_files.findall('MediaFile'):
play_path = xpath_text(media_file, 'URL') play_path = xpath_text(media_file, 'URL')
if not play_path: if not play_path:
@ -143,6 +173,10 @@ class ITVIE(InfoExtractor):
f['url'] = rtmp_url f['url'] = rtmp_url
formats.append(f) formats.append(f)
for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
if caption_url.text:
extract_subtitle(caption_url.text)
ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
hmac = params.get('data-video-hmac') hmac = params.get('data-video-hmac')
if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url): if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url):
@ -198,27 +232,22 @@ class ITVIE(InfoExtractor):
formats.append({ formats.append({
'url': href, 'url': href,
}) })
subs = video_data.get('Subtitles')
if isinstance(subs, list):
for sub in subs:
if not isinstance(sub, dict):
continue
href = sub.get('Href')
if isinstance(href, compat_str):
extract_subtitle(href)
if not info.get('duration'):
info['duration'] = parse_duration(video_data.get('Duration'))
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {}
for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
if not caption_url.text:
continue
ext = determine_ext(caption_url.text, 'ttml')
subtitles.setdefault('en', []).append({
'url': caption_url.text,
'ext': 'ttml' if ext == 'xml' else ext,
})
info = self._search_json_ld(webpage, video_id, default={})
info.update({ info.update({
'id': video_id, 'id': video_id,
'title': title,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'episode_title': title,
'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
'series': xpath_text(playlist, 'ProgrammeTitle'),
'duartion': parse_duration(xpath_text(playlist, 'Duration')),
}) })
return info return info