[bandcamp:weekly] Improve and extract more metadata (closes #12758)

This commit is contained in:
Sergey M․ 2017-06-04 23:21:30 +07:00
parent 62bafabc09
commit 6d923aab35
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -14,6 +14,7 @@ from ..utils import (
ExtractorError, ExtractorError,
float_or_none, float_or_none,
int_or_none, int_or_none,
KNOWN_EXTENSIONS,
parse_filesize, parse_filesize,
unescapeHTML, unescapeHTML,
update_url_query, update_url_query,
@ -22,7 +23,7 @@ from ..utils import (
class BandcampIE(InfoExtractor): class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439', 'md5': 'c557841d5e50261777a6585648adf439',
@ -156,7 +157,7 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(InfoExtractor): class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album' IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))' _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
_TESTS = [{ _TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@ -225,7 +226,9 @@ class BandcampAlbumIE(InfoExtractor):
@classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
return False if BandcampWeeklyIE.suitable(url) else super(BandcampAlbumIE, cls).suitable(url) return (False
if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
@ -258,16 +261,22 @@ class BandcampAlbumIE(InfoExtractor):
class BandcampWeeklyIE(InfoExtractor): class BandcampWeeklyIE(InfoExtractor):
IE_NAME = 'Bandcamp:bandcamp_weekly' IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*&)?show=(?P<id>\d+)(?:$|[&#])' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://bandcamp.com/?show=224', 'url': 'https://bandcamp.com/?show=224',
'md5': 'b00df799c733cf7e0c567ed187dea0fd', 'md5': 'b00df799c733cf7e0c567ed187dea0fd',
'info_dict': { 'info_dict': {
'id': '224', 'id': '224',
'ext': 'opus', 'ext': 'opus',
'title': 'BC Weekly April 4th 2017: Magic Moments', 'title': 'BC Weekly April 4th 2017 - Magic Moments',
'description': 'Stones Throw\'s Vex Ruffin, plus up and coming singer Salami Rose Joe Louis, in conversation about their fantastic DIY albums.', 'description': 'md5:5d48150916e8e02d030623a48512c874',
'duration': 5829.77,
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
'episode_number': 208,
'episode_id': '224',
} }
}, { }, {
'url': 'https://bandcamp.com/?blah/blah@&show=228', 'url': 'https://bandcamp.com/?blah/blah@&show=228',
@ -288,32 +297,53 @@ class BandcampWeeklyIE(InfoExtractor):
# This is desired because any invalid show id redirects to `bandcamp.com` # This is desired because any invalid show id redirects to `bandcamp.com`
# which happens to expose the latest Bandcamp Weekly episode. # which happens to expose the latest Bandcamp Weekly episode.
video_id = compat_str(show['show_id']) show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
def to_format_dictionaries(audio_stream): formats = []
dictionaries = [{'format_id': kvp[0], 'url': kvp[1]} for kvp in audio_stream.items()] for format_id, format_url in show['audio_stream'].items():
known_extensions = ['mp3', 'opus'] if not isinstance(format_url, compat_str):
continue
for dictionary in dictionaries: for known_ext in KNOWN_EXTENSIONS:
for ext in known_extensions: if known_ext in format_id:
if ext in dictionary['format_id']: ext = known_ext
dictionary['ext'] = ext
break break
else:
return dictionaries ext = None
formats.append({
formats = to_format_dictionaries(show['audio_stream']) 'format_id': format_id,
'url': format_url,
'ext': ext,
'vcodec': 'none',
})
self._sort_formats(formats) self._sort_formats(formats)
title = show.get('audio_title') or 'Bandcamp Weekly'
subtitle = show.get('subtitle')
if subtitle:
title += ' - %s' % subtitle
episode_number = None
seq = blob.get('bcw_seq')
if seq and isinstance(seq, list):
try:
episode_number = next(
int_or_none(e.get('episode_number'))
for e in seq
if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
except StopIteration:
pass
return { return {
'id': video_id, 'id': video_id,
'title': show['audio_title'] + ': ' + show['subtitle'], 'title': title,
'description': show.get('desc'), 'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')), 'duration': float_or_none(show.get('audio_duration')),
'webpage_url': 'https://bandcamp.com/?show=' + video_id,
'is_live': False, 'is_live': False,
'release_date': unified_strdate(show.get('published_date')), 'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly', 'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
'episode_number': episode_number,
'episode_id': compat_str(video_id), 'episode_id': compat_str(video_id),
'formats': formats 'formats': formats
} }