From 8f73e89ca0ecde0a8bbd1f1463e9a06a53c6b573 Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 14:15:54 +0800 Subject: [PATCH 01/10] [kuwo] New extractor for kuwo.cn --- youtube_dl/extractor/__init__.py | 9 + youtube_dl/extractor/kuwo.py | 326 +++++++++++++++++++++++++++++++ 2 files changed, 335 insertions(+) create mode 100644 youtube_dl/extractor/kuwo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..82cd85c44 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -257,6 +257,15 @@ from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE from .ku6 import Ku6IE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoSingerMusicIE, + KuwoCategoryIE, + KuwoMvIE, +) from .la7 import LA7IE from .laola1tv import Laola1TvIE from .letv import ( diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py new file mode 100644 index 000000000..6a96a1aa4 --- /dev/null +++ b/youtube_dl/extractor/kuwo.py @@ -0,0 +1,326 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import itertools + +from .common import InfoExtractor +from ..utils import ( + get_element_by_id, + clean_html, + ExtractorError, +) + + +class KuwoIE(InfoExtractor): + IE_NAME = 'kuwo:song' + _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/yinyue/635632/', + 'info_dict': { + 'id': '635632', + 'ext': 'ape', + 'title': '爱我别走', + 'creator': '张震岳', + 'upload_date': '20080122', + 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' + }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/6446136/', + 'info_dict': { + 'id': '6446136', + 'ext': 'mp3', + 'title': '心', + 'creator': 'IU', + 'upload_date': '20150518', + }, + 'params': { + 'format': 'mp3-320' + }, + }] + _FORMATS = [ + {'format': 'ape', 'ext': 'ape', 'preference': 100}, + {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, + {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70}, + {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, + {'format': 'wma', 'ext': 'wma', 'preference': 20}, + {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} + ] + + def _get_formats(self, song_id): + formats = [] + for file_format in self._FORMATS: + song_url = self._download_webpage( + "http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url" % + (file_format['ext'], file_format.get('br', ''), song_id), + song_id, note="Download %s url info" % file_format["format"], + ) + if song_url.startswith('http://') or song_url.startswith('https://'): + formats.append({ + 'url': song_url, + 'format_id': file_format['format'], + 'format': file_format['format'], + 'preference': file_format['preference'], + 'abr': file_format.get('abr'), + }) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + song_id = self._match_id(url) + webpage = self._download_webpage( + url, song_id, note='Download song detail info', + errnote='Unable to get song detail info') + + song_name = self._html_search_regex( + r'

', webpage, 'song name') + singer_name = self._html_search_regex( + r'
.+?title="(.+?)".+?
', webpage, 'singer name', + flags=re.DOTALL, default=None) + lrc_content = clean_html(get_element_by_id("lrcContent", webpage)) + if lrc_content == '暂无': # indicates no lyrics + lrc_content = None + + formats = self._get_formats(song_id) + + album_id = self._html_search_regex( + r'

.+?[0-9]+?)/' + _TEST = { + 'url': 'http://www.kuwo.cn/album/502294/', + 'info_dict': { + 'id': '502294', + 'title': 'M', + 'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + album_id = self._match_id(url) + + webpage = self._download_webpage( + url, album_id, note='Download album info', + errnote='Unable to get album info') + + album_name = self._html_search_regex( + r'

', webpage, + 'album name', flags=re.DOTALL) + album_intro = clean_html( + re.sub(r'^.+简介:', '', get_element_by_id("intro", webpage).strip())) + + entries = [ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'

', + webpage) + ] + return self.playlist_result(entries, album_id, album_name, album_intro) + + +class KuwoChartIE(InfoExtractor): + IE_NAME = 'kuwo:chart' + _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P.+?).htm' + _TEST = { + 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', + 'info_dict': { + 'id': '香港中文龙虎榜', + 'title': '香港中文龙虎榜', + 'description': 're:[0-9]{4}第[0-9]{2}期', + }, + 'playlist_mincount': 10, + } + + def _real_extract(self, url): + chart_id = self._match_id(url) + webpage = self._download_webpage( + url, chart_id, note='Download chart info', + errnote='Unable to get chart info') + + chart_name = self._html_search_regex( + r'

(.+?)

', webpage, 'chart name') + + chart_desc = self._html_search_regex( + r'

([0-9]{4}第[0-9]{2}期)

', webpage, 'chart desc') + + entries = [ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'.+?', webpage) + ] + return self.playlist_result(entries, chart_id, chart_name, chart_desc) + + +class KuwoSingerIE(InfoExtractor): + IE_NAME = 'kuwo:singer' + _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/$' + _TEST = { + 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', + 'info_dict': { + 'id': 'bruno+mars', + 'title': 'Bruno Mars', + }, + 'playlist_count': 10, + } + + def _real_extract(self, url): + singer_id = self._match_id(url) + webpage = self._download_webpage( + url, singer_id, note='Download singer info', + errnote='Unable to get singer info') + + singer_name = self._html_search_regex( + r'姓名:(.+?)', webpage, 'singer name') + + entries = [ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'.+?', + webpage, flags=re.DOTALL) + ] + return self.playlist_result(entries, singer_id, singer_name) + + +class KuwoSingerMusicIE(InfoExtractor): + IE_NAME = 'kuwo:singermusic' + _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/music(_[0-9]+)?.htm' + _TEST = { + 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', + 'info_dict': { + 'id': 'Ali', + 'title': 'Ali的热门歌曲', + }, + 'playlist_mincount': 95, + } + + def _real_extract(self, url): + singer_id = self._match_id(url) + + list_name = None + entries = [] + for page_num in itertools.count(1): + webpage = self._download_webpage( + 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), + singer_id, note='Download song list page #%d' % page_num, + errnote='Unable to get song list page #%d' % page_num) + + if list_name is None: + list_name = self._html_search_regex( + r'

([^<>]+)', webpage, 'list name') + + entries.extend([ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'

下一页', webpage): + break + + return self.playlist_result(entries, singer_id, list_name) + + +class KuwoCategoryIE(InfoExtractor): + IE_NAME = 'kuwo:category' + _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P[0-9]+?).htm' + _TEST = { + 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', + 'info_dict': { + 'id': '86375', + 'title': '八十年代精选', + 'description': '这些都是属于八十年代的回忆!', + }, + 'playlist_count': 30, + } + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage( + url, category_id, note='Download category info', + errnote='Unable to get category info') + + category_name = self._html_search_regex( + r'

[^<>]+?

', webpage, 'category name') + + category_desc = re.sub( + r'^.+简介:', '', get_element_by_id("intro", webpage).strip()) + + jsonm = self._parse_json(self._html_search_regex( + r'var jsonm = (\{.+?\});', webpage, 'category songs'), category_id) + + entries = [ + self.url_result( + "http://www.kuwo.cn/yinyue/%s/" % song['musicrid'], + 'Kuwo', song['musicrid']) + for song in jsonm['musiclist'] + ] + return self.playlist_result(entries, category_id, category_name, category_desc) + + +class KuwoMvIE(KuwoIE): + IE_NAME = 'kuwo:mv' + _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P[0-9]+?)/' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/mv/6480076/', + 'info_dict': { + 'id': '6480076', + 'ext': 'mkv', + 'title': '我们家MV', + 'creator': '2PM', + }, + }] + _FORMATS = KuwoIE._FORMATS + [ + {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, + {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, + ] + + def _real_extract(self, url): + song_id = self._match_id(url) + webpage = self._download_webpage( + url, song_id, note='Download mv detail info: %s' % song_id, + errnote='Unable to get mv detail info: %s' % song_id) + + mobj = re.search( + r'

[^<>]+[^<>]+

', + webpage) + if mobj: + song_name = mobj.group('song') + singer_name = mobj.group('singer') + else: + raise ExtractorError("Unable to find song or singer names") + + formats = self._get_formats(song_id) + + return { + 'id': song_id, + 'title': song_name, + 'creator': singer_name, + 'formats': formats, + } From 2b0fa1f7dd8d158b69eec4d17b254b99e976bc5c Mon Sep 17 00:00:00 2001 From: ping Date: Fri, 10 Jul 2015 15:09:12 +0800 Subject: [PATCH 02/10] [kuwo] Merge KuwoSingerMusicIE into KuwoSingerIE --- youtube_dl/extractor/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 82cd85c44..a348b3077 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -262,7 +262,6 @@ from .kuwo import ( KuwoAlbumIE, KuwoChartIE, KuwoSingerIE, - KuwoSingerMusicIE, KuwoCategoryIE, KuwoMvIE, ) From 1633491bff3e393d7d095b7303f954dacce4f4da Mon Sep 17 00:00:00 2001 From: ping Date: Fri, 10 Jul 2015 15:19:07 +0800 Subject: [PATCH 03/10] [kuwo] Merge KuwoSingerMusicIE into KuwoSingerIE (missed kuwo.py) --- youtube_dl/extractor/kuwo.py | 53 ++++++++++++------------------------ 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 6a96a1aa4..82d5f3f95 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -180,15 +180,22 @@ class KuwoChartIE(InfoExtractor): class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/$' - _TEST = { + _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+)' + _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { 'id': 'bruno+mars', 'title': 'Bruno Mars', }, 'playlist_count': 10, - } + }, { + 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', + 'info_dict': { + 'id': 'Ali', + 'title': 'Ali', + }, + 'playlist_mincount': 95, + }] def _real_extract(self, url): singer_id = self._match_id(url) @@ -197,54 +204,28 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'姓名:(.+?)', webpage, 'singer name') + r'
[\n\s\t]*?

(.+?).+?', - webpage, flags=re.DOTALL) - ] - return self.playlist_result(entries, singer_id, singer_name) - - -class KuwoSingerMusicIE(InfoExtractor): - IE_NAME = 'kuwo:singermusic' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/music(_[0-9]+)?.htm' - _TEST = { - 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', - 'info_dict': { - 'id': 'Ali', - 'title': 'Ali的热门歌曲', - }, - 'playlist_mincount': 95, - } - - def _real_extract(self, url): - singer_id = self._match_id(url) - - list_name = None entries = [] + first_page_only = False if re.match(r'.+/music(?:_[0-9]+)?\.htm', url) else True for page_num in itertools.count(1): webpage = self._download_webpage( 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), singer_id, note='Download song list page #%d' % page_num, errnote='Unable to get song list page #%d' % page_num) - if list_name is None: - list_name = self._html_search_regex( - r'

([^<>]+)', webpage, 'list name') - entries.extend([ self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) for song_id in re.findall( r'

下一页', webpage): + ][:10 if first_page_only else None]) + + if first_page_only or not re.search(r'下一页', webpage): break - return self.playlist_result(entries, singer_id, list_name) + return self.playlist_result(entries, singer_id, singer_name) class KuwoCategoryIE(InfoExtractor): From a34af8d0667d8f4ceba3380f808a6d563ca01d77 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 19:13:52 +0800 Subject: [PATCH 04/10] [kuwo] PEP8 --- youtube_dl/extractor/kuwo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 82d5f3f95..9c62191b5 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -249,10 +249,10 @@ class KuwoCategoryIE(InfoExtractor): category_name = self._html_search_regex( r'

[^<>]+?

', webpage, 'category name') - + category_desc = re.sub( r'^.+简介:', '', get_element_by_id("intro", webpage).strip()) - + jsonm = self._parse_json(self._html_search_regex( r'var jsonm = (\{.+?\});', webpage, 'category songs'), category_id) From a31e3e7dcb9d0471d90ec8562934a144d25d7132 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:23:07 +0800 Subject: [PATCH 05/10] [kuwo] Regular expression improvements 1. Prevent .+ and .* 2. Use [^>]+ instead of spaces for HTML tags 3. Remove unnecessary trailing parts --- youtube_dl/extractor/kuwo.py | 47 +++++++++++++++------------- youtube_dl/extractor/neteasemusic.py | 2 +- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 9c62191b5..1095a26e2 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -9,6 +9,7 @@ from ..utils import ( get_element_by_id, clean_html, ExtractorError, + remove_start, ) @@ -73,10 +74,10 @@ class KuwoIE(InfoExtractor): errnote='Unable to get song detail info') song_name = self._html_search_regex( - r'

', webpage, 'song name') + r']+title="([^"]+)">', webpage, 'song name') singer_name = self._html_search_regex( - r'
.+?title="(.+?)".+?
', webpage, 'singer name', - flags=re.DOTALL, default=None) + r']+class="s_img">\s*]+title="([^>]+)"', + webpage, 'singer name', default=None) lrc_content = clean_html(get_element_by_id("lrcContent", webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None @@ -84,7 +85,7 @@ class KuwoIE(InfoExtractor): formats = self._get_formats(song_id) album_id = self._html_search_regex( - r'

.+?]+class="album"[^<]+]+href="http://www\.kuwo\.cn/album/(\d+)/"', webpage, 'album id', default=None, fatal=False) publish_time = None @@ -131,15 +132,16 @@ class KuwoAlbumIE(InfoExtractor): errnote='Unable to get album info') album_name = self._html_search_regex( - r'

', webpage, - 'album name', flags=re.DOTALL) - album_intro = clean_html( - re.sub(r'^.+简介:', '', get_element_by_id("intro", webpage).strip())) + r']+class="comm"[^<]+]+title="([^"]+)"', webpage, + 'album name') + album_intro = remove_start( + clean_html(get_element_by_id("intro", webpage)), + '%s简介:' % album_name) entries = [ self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) for song_id in re.findall( - r'

', + r']+class="listen">]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) ] return self.playlist_result(entries, album_id, album_name, album_intro) @@ -147,7 +149,7 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' - _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P.+?).htm' + _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { @@ -165,15 +167,15 @@ class KuwoChartIE(InfoExtractor): errnote='Unable to get chart info') chart_name = self._html_search_regex( - r'

(.+?)

', webpage, 'chart name') + r']+class="unDis">([^<]+)

', webpage, 'chart name') chart_desc = self._html_search_regex( - r'

([0-9]{4}第[0-9]{2}期)

', webpage, 'chart desc') + r']+class="tabDef">(\d{4}第\d{2}期)

', webpage, 'chart desc') entries = [ self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) for song_id in re.findall( - r'.+?', webpage) + r']+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) ] return self.playlist_result(entries, chart_id, chart_name, chart_desc) @@ -204,11 +206,11 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'
[\n\s\t]*?

(.+?)\s*

([^<]+)]+href="http://www\.kuwo\.cn/yinyue/([0-9]+)/', webpage) ][:10 if first_page_only else None]) - if first_page_only or not re.search(r'下一页', webpage): + if first_page_only or not re.search(r']+href="[^"]+">下一页', webpage): break return self.playlist_result(entries, singer_id, singer_name) @@ -248,13 +250,14 @@ class KuwoCategoryIE(InfoExtractor): errnote='Unable to get category info') category_name = self._html_search_regex( - r'

[^<>]+?

', webpage, 'category name') + r']+title="([^<>]+?)">[^<>]+?

', webpage, 'category name') - category_desc = re.sub( - r'^.+简介:', '', get_element_by_id("intro", webpage).strip()) + category_desc = remove_start( + get_element_by_id("intro", webpage).strip(), + '%s简介:' % category_name) jsonm = self._parse_json(self._html_search_regex( - r'var jsonm = (\{.+?\});', webpage, 'category songs'), category_id) + r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) entries = [ self.url_result( @@ -289,7 +292,7 @@ class KuwoMvIE(KuwoIE): errnote='Unable to get mv detail info: %s' % song_id) mobj = re.search( - r'

[^<>]+[^<>]+

', + r']+title="(?P[^"]+)">[^<]+]+title="(?P[^"]+)"', webpage) if mobj: song_name = mobj.group('song') diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index bdfe7e63f..ee52efaee 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -229,7 +229,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): if info['artist']['trans']: name = '%s - %s' % (name, info['artist']['trans']) if info['artist']['alias']: - name = '%s - %s' % (name, ";".join(info['artist']['alias'])) + name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) entries = [ self.url_result('http://music.163.com/#/song?id=%s' % song['id'], From a9684c0dbf3879478fd223ce7594d58be7dffa4f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:46:44 +0800 Subject: [PATCH 06/10] [kuwo] Add KuwoBaseIE --- youtube_dl/extractor/kuwo.py | 63 +++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 1095a26e2..928f7f62d 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -13,32 +13,7 @@ from ..utils import ( ) -class KuwoIE(InfoExtractor): - IE_NAME = 'kuwo:song' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' - _TESTS = [{ - 'url': 'http://www.kuwo.cn/yinyue/635632/', - 'info_dict': { - 'id': '635632', - 'ext': 'ape', - 'title': '爱我别走', - 'creator': '张震岳', - 'upload_date': '20080122', - 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' - }, - }, { - 'url': 'http://www.kuwo.cn/yinyue/6446136/', - 'info_dict': { - 'id': '6446136', - 'ext': 'mp3', - 'title': '心', - 'creator': 'IU', - 'upload_date': '20150518', - }, - 'params': { - 'format': 'mp3-320' - }, - }] +class KuwoBaseIE(InfoExtractor): _FORMATS = [ {'format': 'ape', 'ext': 'ape', 'preference': 100}, {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, @@ -67,6 +42,34 @@ class KuwoIE(InfoExtractor): self._sort_formats(formats) return formats + +class KuwoIE(KuwoBaseIE): + IE_NAME = 'kuwo:song' + _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/yinyue/635632/', + 'info_dict': { + 'id': '635632', + 'ext': 'ape', + 'title': '爱我别走', + 'creator': '张震岳', + 'upload_date': '20080122', + 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' + }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/6446136/', + 'info_dict': { + 'id': '6446136', + 'ext': 'mp3', + 'title': '心', + 'creator': 'IU', + 'upload_date': '20150518', + }, + 'params': { + 'format': 'mp3-320' + }, + }] + def _real_extract(self, url): song_id = self._match_id(url) webpage = self._download_webpage( @@ -268,10 +271,10 @@ class KuwoCategoryIE(InfoExtractor): return self.playlist_result(entries, category_id, category_name, category_desc) -class KuwoMvIE(KuwoIE): +class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P[0-9]+?)/' - _TESTS = [{ + _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', @@ -279,8 +282,8 @@ class KuwoMvIE(KuwoIE): 'title': '我们家MV', 'creator': '2PM', }, - }] - _FORMATS = KuwoIE._FORMATS + [ + } + _FORMATS = KuwoBaseIE._FORMATS + [ {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, ] From cf2c5fda4f3e753cc64098e6a751cf1a220efae7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:48:48 +0800 Subject: [PATCH 07/10] [kuwo] Use single quotes --- youtube_dl/extractor/kuwo.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 928f7f62d..2b5321cc2 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -27,9 +27,9 @@ class KuwoBaseIE(InfoExtractor): formats = [] for file_format in self._FORMATS: song_url = self._download_webpage( - "http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url" % + 'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' % (file_format['ext'], file_format.get('br', ''), song_id), - song_id, note="Download %s url info" % file_format["format"], + song_id, note='Download %s url info' % file_format['format'], ) if song_url.startswith('http://') or song_url.startswith('https://'): formats.append({ @@ -81,7 +81,7 @@ class KuwoIE(KuwoBaseIE): singer_name = self._html_search_regex( r']+class="s_img">\s*]+title="([^>]+)"', webpage, 'singer name', default=None) - lrc_content = clean_html(get_element_by_id("lrcContent", webpage)) + lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None @@ -94,7 +94,7 @@ class KuwoIE(KuwoBaseIE): publish_time = None if album_id is not None: album_info_page = self._download_webpage( - "http://www.kuwo.cn/album/%s/" % album_id, song_id, + 'http://www.kuwo.cn/album/%s/' % album_id, song_id, note='Download album detail info', errnote='Unable to get album detail info') @@ -138,11 +138,11 @@ class KuwoAlbumIE(InfoExtractor): r']+class="comm"[^<]+]+title="([^"]+)"', webpage, 'album name') album_intro = remove_start( - clean_html(get_element_by_id("intro", webpage)), + clean_html(get_element_by_id('intro', webpage)), '%s简介:' % album_name) entries = [ - self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) for song_id in re.findall( r']+class="listen">]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) @@ -176,7 +176,7 @@ class KuwoChartIE(InfoExtractor): r']+class="tabDef">(\d{4}第\d{2}期)

', webpage, 'chart desc') entries = [ - self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) for song_id in re.findall( r']+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) ] @@ -221,7 +221,7 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get song list page #%d' % page_num) entries.extend([ - self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) for song_id in re.findall( r']+class="m_name">]+href="http://www\.kuwo\.cn/yinyue/([0-9]+)/', webpage) @@ -256,7 +256,7 @@ class KuwoCategoryIE(InfoExtractor): r']+title="([^<>]+?)">[^<>]+?

', webpage, 'category name') category_desc = remove_start( - get_element_by_id("intro", webpage).strip(), + get_element_by_id('intro', webpage).strip(), '%s简介:' % category_name) jsonm = self._parse_json(self._html_search_regex( @@ -264,7 +264,7 @@ class KuwoCategoryIE(InfoExtractor): entries = [ self.url_result( - "http://www.kuwo.cn/yinyue/%s/" % song['musicrid'], + 'http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo', song['musicrid']) for song in jsonm['musiclist'] ] @@ -301,7 +301,7 @@ class KuwoMvIE(KuwoBaseIE): song_name = mobj.group('song') singer_name = mobj.group('singer') else: - raise ExtractorError("Unable to find song or singer names") + raise ExtractorError('Unable to find song or singer names') formats = self._get_formats(song_id) From d3b8908886a78dd63441754f1ac20b70cb29de56 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:53:48 +0800 Subject: [PATCH 08/10] [kuwo] Simpler calls to url_result() --- youtube_dl/extractor/kuwo.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 2b5321cc2..69afacac9 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -142,9 +142,8 @@ class KuwoAlbumIE(InfoExtractor): '%s简介:' % album_name) entries = [ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) - for song_id in re.findall( - r']+class="listen">]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r']+class="listen">]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', webpage) ] return self.playlist_result(entries, album_id, album_name, album_intro) @@ -176,9 +175,8 @@ class KuwoChartIE(InfoExtractor): r']+class="tabDef">(\d{4}第\d{2}期)

', webpage, 'chart desc') entries = [ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) - for song_id in re.findall( - r']+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r']+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) ] return self.playlist_result(entries, chart_id, chart_name, chart_desc) @@ -221,9 +219,8 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get song list page #%d' % page_num) entries.extend([ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) - for song_id in re.findall( - r']+class="m_name">]+href="http://www\.kuwo\.cn/yinyue/([0-9]+)/', + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r']+class="m_name">]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', webpage) ][:10 if first_page_only else None]) @@ -263,9 +260,7 @@ class KuwoCategoryIE(InfoExtractor): r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) entries = [ - self.url_result( - 'http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], - 'Kuwo', song['musicrid']) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo') for song in jsonm['musiclist'] ] return self.playlist_result(entries, category_id, category_name, category_desc) From 9f01c1a803d06034d443882f57528eba342a9c94 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:56:51 +0800 Subject: [PATCH 09/10] [kuwo] Use \d instead of [0-9] --- youtube_dl/extractor/kuwo.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 69afacac9..18bf66404 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -45,7 +45,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' + _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P\d+?)/' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -116,7 +116,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' - _VALID_URL = r'http://www\.kuwo\.cn/album/(?P[0-9]+?)/' + _VALID_URL = r'http://www\.kuwo\.cn/album/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -157,7 +157,7 @@ class KuwoChartIE(InfoExtractor): 'info_dict': { 'id': '香港中文龙虎榜', 'title': '香港中文龙虎榜', - 'description': 're:[0-9]{4}第[0-9]{2}期', + 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } @@ -211,7 +211,7 @@ class KuwoSingerIE(InfoExtractor): ) entries = [] - first_page_only = False if re.search(r'/music(?:_[0-9]+)?\.htm', url) else True + first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True for page_num in itertools.count(1): webpage = self._download_webpage( 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), @@ -232,7 +232,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' - _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P[0-9]+?).htm' + _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { @@ -268,7 +268,7 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' - _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P[0-9]+?)/' + _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { From 094790d2c963ef7ee9f5861e0458174ddd15ed87 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:03:49 +0800 Subject: [PATCH 10/10] [kuwo:song] Give warnings for unavailable optional fields --- youtube_dl/extractor/kuwo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 18bf66404..a021f3cdf 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -80,7 +80,7 @@ class KuwoIE(KuwoBaseIE): r']+title="([^"]+)">', webpage, 'song name') singer_name = self._html_search_regex( r']+class="s_img">\s*]+title="([^>]+)"', - webpage, 'singer name', default=None) + webpage, 'singer name', fatal=False) lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None @@ -89,7 +89,7 @@ class KuwoIE(KuwoBaseIE): album_id = self._html_search_regex( r']+class="album"[^<]+]+href="http://www\.kuwo\.cn/album/(\d+)/"', - webpage, 'album id', default=None, fatal=False) + webpage, 'album id', fatal=False) publish_time = None if album_id is not None: @@ -100,7 +100,7 @@ class KuwoIE(KuwoBaseIE): publish_time = self._html_search_regex( r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page, - 'publish time', default=None) + 'publish time', fatal=False) if publish_time: publish_time = publish_time.replace('-', '')