[youtube] Extract video titles for channel playlist if possible (Closes #4971)

This commit is contained in:
Sergey M․ 2015-04-12 23:19:00 +06:00
parent 830d53bfae
commit fb69240ca0

View file

@@ -1370,10 +1370,18 @@ class YoutubeChannelIE(InfoExtractor):
     def extract_videos_from_page(self, page):
         ids_in_page = []
-        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
-            if mobj.group(1) not in ids_in_page:
-                ids_in_page.append(mobj.group(1))
-        return ids_in_page
+        titles_in_page = []
+        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(mobj.group('title'))
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+        return zip(ids_in_page, titles_in_page)
     def _real_extract(self, url):
         channel_id = self._match_id(url)
@@ -1390,10 +1398,12 @@ class YoutubeChannelIE(InfoExtractor):
         if autogenerated:
             # The videos are contained in a single page
             # the ajax pages can't be used, they are empty
-            video_ids = self.extract_videos_from_page(channel_page)
+            videos = self.extract_videos_from_page(channel_page)
             entries = [
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in video_ids]
+                self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+                for video_id, video_title in videos]
             return self.playlist_result(entries, channel_id)
         def _entries():
@@ -1401,9 +1411,10 @@ class YoutubeChannelIE(InfoExtractor):
             for pagenum in itertools.count(1):
                 ids_in_page = self.extract_videos_from_page(content_html)
-                for video_id in ids_in_page:
+                for video_id, video_title in ids_in_page:
                     yield self.url_result(
-                        video_id, 'Youtube', video_id=video_id)
+                        video_id, 'Youtube', video_id=video_id,
+                        video_title=video_title)
                 mobj = re.search(
                     r'data-uix-load-more-href="/?(?P<more>[^"]+)"',