[motherless:group] Relax entry extraction and add a fallback scenario

This commit is contained in:
Sergey M․ 2018-01-07 00:31:53 +07:00
parent a133eb7764
commit 0a5b1295b7
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -148,14 +148,27 @@ class MotherlessGroupIE(InfoExtractor):
else super(MotherlessGroupIE, cls).suitable(url)) else super(MotherlessGroupIE, cls).suitable(url))
def _extract_entries(self, webpage, base):
    """Build the list of video entries referenced by ``webpage``.

    Two strategies are tried in order:

    1. Scan every ``href="/..."`` attribute, resolve it against ``base``
       and keep only the URLs that ``MotherlessIE`` declares suitable.
       When the link is immediately followed by an ``<img>`` whose
       ``alt`` text has the form ``<prefix>- <title>``, the part after
       the dash is passed along as the entry title; otherwise the title
       is ``None``.
    2. If the first pass produced nothing, fall back to the
       ``data-codename`` attributes found in the page and build one
       entry per distinct codename (first-seen order, via
       ``orderedSet``).

    :param webpage: HTML text to scan.
    :param base: URL that relative links are resolved against.
    :return: list of ``self.url_result(...)`` values, possibly empty.
    """
    entries = []
    for mobj in re.finditer(
            r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
            webpage):
        video_url = compat_urlparse.urljoin(base, mobj.group('href'))
        # The relaxed pattern matches any site-relative link, so let the
        # single-video extractor decide which ones are actual videos.
        if not MotherlessIE.suitable(video_url):
            continue
        video_id = MotherlessIE._match_id(video_url)
        # The title group is optional; this is None when no <img alt=...>
        # directly follows the link.
        title = mobj.group('title')
        entries.append(self.url_result(
            video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
            video_title=title))
    # Alternative fallback
    if not entries:
        entries = [
            self.url_result(
                compat_urlparse.urljoin(base, '/' + video_id),
                ie=MotherlessIE.ie_key(), video_id=video_id)
            for video_id in orderedSet(re.findall(
                r'data-codename=["\']([A-Z0-9]+)', webpage))]
    return entries
def _real_extract(self, url): def _real_extract(self, url):
group_id = self._match_id(url) group_id = self._match_id(url)