1
0
Fork 0
mirror of https://gitlab.com/dstftw/youtube-dl.git synced 2020-11-16 09:42:26 +00:00

[xiami] Improve extraction (Closes #9079)

* Switch to JSON source
* Add abstract IE for playlists
* Extract more track-related metadata
This commit is contained in:
Sergey M․ 2016-04-30 21:50:23 +06:00
parent 89c0dc9a5f
commit 4e0c0c1508
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 96 additions and 99 deletions

View file

@ -942,7 +942,7 @@ from .xhamster import (
XHamsterEmbedIE, XHamsterEmbedIE,
) )
from .xiami import ( from .xiami import (
XiamiIE, XiamiSongIE,
XiamiAlbumIE, XiamiAlbumIE,
XiamiArtistIE, XiamiArtistIE,
XiamiCollectionIE XiamiCollectionIE

View file

@ -1,50 +1,42 @@
# -*- coding: utf-8 -*- # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
xpath_element,
xpath_text,
xpath_with_ns,
int_or_none,
ExtractorError
)
from ..compat import compat_urllib_parse_unquote from ..compat import compat_urllib_parse_unquote
from ..utils import int_or_none
class XiamiBaseIE(InfoExtractor): class XiamiBaseIE(InfoExtractor):
_API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id'
_XML_BASE_URL = 'http://www.xiami.com/song/playlist/id' def _extract_track(self, track, track_id=None):
_NS_MAP = {'xm': 'http://xspf.org/ns/0/'} title = track['title']
track_url = self._decrypt(track['location'])
def _extract_track(self, track): subtitles = {}
artist = xpath_text(track, xpath_with_ns('xm:artist', self._NS_MAP), default='') lyrics_url = track.get('lyric_url') or track.get('lyric')
artist = artist.split(';') if lyrics_url and lyrics_url.startswith('http'):
subtitles['origin'] = [{'url': lyrics_url}]
ret = { return {
'id': xpath_text(track, xpath_with_ns('xm:song_id', self._NS_MAP)), 'id': track.get('song_id') or track_id,
'title': xpath_text(track, xpath_with_ns('xm:title', self._NS_MAP)), 'url': track_url,
'album': xpath_text(track, xpath_with_ns('xm:album_name', self._NS_MAP)), 'title': title,
'artist': ';'.join(artist) if artist else None, 'thumbnail': track.get('pic') or track.get('album_pic'),
'creator': artist[0] if artist else None, 'duration': int_or_none(track.get('length')),
'url': self._decrypt(xpath_text(track, xpath_with_ns('xm:location', self._NS_MAP))), 'creator': track.get('artist', '').split(';')[0],
'thumbnail': xpath_text(track, xpath_with_ns('xm:pic', self._NS_MAP), default=None), 'track': title,
'duration': int_or_none(xpath_text(track, xpath_with_ns('xm:length', self._NS_MAP))), 'album': track.get('album_name'),
'artist': track.get('artist'),
'subtitles': subtitles,
} }
lyrics_url = xpath_text(track, xpath_with_ns('xm:lyric', self._NS_MAP)) def _extract_tracks(self, item_id, typ=None):
if lyrics_url and lyrics_url.endswith('.lrc'): playlist = self._download_json(
ret['description'] = self._download_webpage(lyrics_url, ret['id']) '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id)
return ret return [
self._extract_track(track, item_id)
def _extract_xml(self, _id, typ=''): for track in playlist['data']['trackList']]
playlist = self._download_xml('%s/%s%s' % (self._XML_BASE_URL, _id, typ), _id)
tracklist = xpath_element(playlist, xpath_with_ns('./xm:trackList', self._NS_MAP))
if not len(tracklist):
raise ExtractorError('No track found')
return [self._extract_track(track) for track in tracklist]
@staticmethod @staticmethod
def _decrypt(origin): def _decrypt(origin):
@ -62,75 +54,87 @@ class XiamiBaseIE(InfoExtractor):
ans = '' ans = ''
for i in range(0, short_lenth + 1): for i in range(0, short_lenth + 1):
for j in range(0, n): for j in range(0, n):
if len(l[j])>i: if len(l[j]) > i:
ans += l[j][i] ans += l[j][i]
return compat_urllib_parse_unquote(ans).replace('^', '0') return compat_urllib_parse_unquote(ans).replace('^', '0')
class XiamiIE(XiamiBaseIE): class XiamiSongIE(XiamiBaseIE):
IE_NAME = 'xiami:song' IE_NAME = 'xiami:song'
IE_DESC = '虾米音乐' IE_DESC = '虾米音乐'
_VALID_URL = r'http://www\.xiami\.com/song/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P<id>[0-9]+)'
_TESTS = [ _TESTS = [{
{
'url': 'http://www.xiami.com/song/1775610518', 'url': 'http://www.xiami.com/song/1775610518',
'md5': '521dd6bea40fd5c9c69f913c232cb57e', 'md5': '521dd6bea40fd5c9c69f913c232cb57e',
'info_dict': { 'info_dict': {
'id': '1775610518', 'id': '1775610518',
'ext': 'mp3', 'ext': 'mp3',
'title': 'Woman', 'title': 'Woman',
'creator': 'HONNE',
'album': 'Woman',
'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
'description': 'md5:052ec7de41ca19f67e7fd70a1bfc4e0b', 'duration': 265,
} 'creator': 'HONNE',
'track': 'Woman',
'album': 'Woman',
'artist': 'HONNE',
'subtitles': {
'origin': [{
'ext': 'lrc',
}],
}, },
{ }
}, {
'url': 'http://www.xiami.com/song/1775256504', 'url': 'http://www.xiami.com/song/1775256504',
'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc', 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
'info_dict': { 'info_dict': {
'id': '1775256504', 'id': '1775256504',
'ext': 'mp3', 'ext': 'mp3',
'title': '悟空', 'title': '悟空',
'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
'duration': 200,
'creator': '戴荃', 'creator': '戴荃',
'track': '悟空',
'album': '悟空', 'album': '悟空',
'description': 'md5:206e67e84f9bed1d473d04196a00b990', 'artist': '戴荃',
} 'subtitles': {
'origin': [{
'ext': 'lrc',
}],
}, },
] }
}]
def _real_extract(self, url): def _real_extract(self, url):
_id = self._match_id(url) return self._extract_tracks(self._match_id(url))[0]
return self._extract_xml(_id)[0]
class XiamiAlbumIE(XiamiBaseIE): class XiamiPlaylistBaseIE(XiamiBaseIE):
def _real_extract(self, url):
item_id = self._match_id(url)
return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id)
class XiamiAlbumIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:album' IE_NAME = 'xiami:album'
IE_DESC = '虾米音乐 - 专辑' IE_DESC = '虾米音乐 - 专辑'
_VALID_URL = r'http://www\.xiami\.com/album/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[0-9]+)'
_TESTS = [ _TYPE = '1'
{ _TESTS = [{
'url': 'http://www.xiami.com/album/2100300444', 'url': 'http://www.xiami.com/album/2100300444',
'info_dict': { 'info_dict': {
'id': '2100300444', 'id': '2100300444',
}, },
'playlist_count': 10, 'playlist_count': 10,
}, }, {
{
'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
'only_matching': True, 'only_matching': True,
} }]
]
def _real_extract(self, url):
_id = self._match_id(url)
return self.playlist_result(self._extract_xml(_id, '/type/1'), _id)
class XiamiArtistIE(XiamiBaseIE): class XiamiArtistIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:artist' IE_NAME = 'xiami:artist'
IE_DESC = '虾米音乐 - 歌手' IE_DESC = '虾米音乐 - 歌手'
_VALID_URL = r'http://www\.xiami\.com/artist/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[0-9]+)'
_TYPE = '2'
_TEST = { _TEST = {
'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp', 'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp',
'info_dict': { 'info_dict': {
@ -139,23 +143,16 @@ class XiamiArtistIE(XiamiBaseIE):
'playlist_count': 20, 'playlist_count': 20,
} }
def _real_extract(self, url):
_id = self._match_id(url)
return self.playlist_result(self._extract_xml(_id, '/type/2'), _id)
class XiamiCollectionIE(XiamiPlaylistBaseIE):
class XiamiCollectionIE(XiamiBaseIE):
IE_NAME = 'xiami:collection' IE_NAME = 'xiami:collection'
IE_DESC = '虾米音乐 - 精选集' IE_DESC = '虾米音乐 - 精选集'
_VALID_URL = r'http://www\.xiami\.com/collect/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[0-9]+)'
_TYPE = '3'
_TEST = { _TEST = {
'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr', 'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr',
'info_dict': { 'info_dict': {
'id': '156527391', 'id': '156527391',
}, },
'playlist_count': 26, 'playlist_mincount': 29,
} }
def _real_extract(self, url):
_id = self._match_id(url)
return self.playlist_result(self._extract_xml(_id, '/type/3'), _id)