mirror of
https://gitlab.com/dstftw/youtube-dl.git
synced 2020-11-16 09:42:26 +00:00
Fix MIT extractor for Python 2.6
The HTML of the MIT page does not parse cleanly under Python 2.6 because it contains script tags nested inside an actual script element. The offending markup sits inside an HTML comment block (`<!-- ... -->`), so stripping all comment blocks from the page before parsing it fixes the problem.
This commit is contained in:
parent
2891932bf0
commit
b5ba7b9dcf
|
@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor):
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = re.match(self._VALID_URL, url)
|
mobj = re.match(self._VALID_URL, url)
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
webpage = self._download_webpage(
|
raw_page = self._download_webpage(
|
||||||
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
|
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
|
||||||
embed_page = self._download_webpage(
|
clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
|
||||||
'http://techtv.mit.edu/embeds/%s/' % video_id, video_id,
|
|
||||||
note=u'Downloading embed page')
|
|
||||||
|
|
||||||
base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
|
base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
|
||||||
embed_page, u'base url')
|
raw_page, u'base url')
|
||||||
formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page,
|
formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
|
||||||
u'video formats')
|
u'video formats')
|
||||||
formats = json.loads(formats_json)
|
formats = json.loads(formats_json)
|
||||||
formats = sorted(formats, key=lambda f: f['bitrate'])
|
formats = sorted(formats, key=lambda f: f['bitrate'])
|
||||||
|
|
||||||
title = get_element_by_id('edit-title', webpage)
|
title = get_element_by_id('edit-title', clean_page)
|
||||||
description = clean_html(get_element_by_id('edit-description', webpage))
|
description = clean_html(get_element_by_id('edit-description', clean_page))
|
||||||
thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
|
thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
|
||||||
embed_page, u'thumbnail', flags=re.DOTALL)
|
raw_page, u'thumbnail', flags=re.DOTALL)
|
||||||
|
|
||||||
return {'id': video_id,
|
return {'id': video_id,
|
||||||
'title': title,
|
'title': title,
|
||||||
|
|
Loading…
Reference in a new issue