[hearthisat] Add new extractor (Closes #4743)

2024-01-07 17:16:08 +00:00 · 2015-01-21 21:47:55 +02:00 · 2015-01-21 21:47:55 +02:00 · e5763a7a7e
parent 8bb1bdfae9
commit e5763a7a7e
2 changed files with 93 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -176,6 +176,7 @@ from .goshgay import GoshgayIE
 from .grooveshark import GroovesharkIE
 from .groupon import GrouponIE
 from .hark import HarkIE
 from .hearthisat import HearThisAtIE
 from .heise import HeiseIE
 from .hellporno import HellPornoIE
 from .helsinki import HelsinkiIE
--- a/youtube_dl/extractor/hearthisat.py
+++ b/youtube_dl/extractor/hearthisat.py
@@ -0,0 +1,92 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..compat import compat_urllib_request
 from ..utils import (
    str_to_int,
    urlencode_postdata,
 )
 class HearThisAtIE(InfoExtractor):
    """Extractor for single track pages on hearthis.at.

    A track URL has the form ``hearthis.at/<artist>/<title>``.  Metadata is
    collected from two places: the track's HTML page (track id, counters,
    duration, timestamp, MP3 URL, Open Graph description/thumbnail) and the
    JSON answer of ``playlist.php`` (artist, title, category).
    """

    _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
    # Endpoint that returns JSON metadata for the track ids POSTed to it.
    _PLAYLIST_URL = 'https://hearthis.at/playlist.php'
    _TEST = {
        'url': 'https://hearthis.at/moofi/dr-kreep',
        'md5': 'd594c573227a89f4256f0b03e68c80cc',
        'info_dict': {
            'id': '150939',
            'ext': 'mp3',
            'title': 'Moofi - Dr. Kreep',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1421564134,
            'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
            'upload_date': '20150118',
            'comment_count': int,
            'view_count': int,
            'like_count': int,
            'duration': 71,
            'categories': ['Experimental'],
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the track page at *url*.

        The track id, the MP3 URL and the playlist JSON are required (the
        corresponding lookups are fatal); the counters, duration and
        timestamp are looked up with ``fatal=False``.
        """
        m = re.match(self._VALID_URL, url)
        # Human readable id built from the URL slugs, used until the
        # numeric track id has been found in the page.
        display_id = '{artist:s} - {title:s}'.format(**m.groupdict())
        webpage = self._download_webpage(url, display_id)
        track_id = self._search_regex(
            r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')

        # Ask the playlist endpoint for this single track; the first
        # element of the returned JSON list is used as the track record.
        payload = urlencode_postdata({'tracks[]': track_id})
        req = compat_urllib_request.Request(self._PLAYLIST_URL, payload)
        req.add_header('Content-type', 'application/x-www-form-urlencoded')
        track = self._download_json(req, track_id, 'Downloading playlist')[0]
        title = '{artist:s} - {title:s}'.format(**track)

        # Only report categories when the record carries a non-empty one.
        categories = None
        if track.get('category'):
            categories = [track['category']]

        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)

        # Counter spans share one layout; only the class name differs.
        meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>'
        view_count = str_to_int(self._search_regex(
            meta_span % 'plays_count', webpage, 'view count', fatal=False))
        like_count = str_to_int(self._search_regex(
            meta_span % 'likes_count', webpage, 'like count', fatal=False))
        comment_count = str_to_int(self._search_regex(
            meta_span % 'comment_count', webpage, 'comment count', fatal=False))
        duration = str_to_int(self._search_regex(
            r'data-length="(\d+)', webpage, 'duration', fatal=False))
        timestamp = str_to_int(self._search_regex(
            r'<span[^>]+class="calctime"[^>]+data-time="(\d+)', webpage, 'timestamp', fatal=False))

        track_url = self._search_regex(
            r'<a[^>]+data-mp3="([^"]+)"', webpage, 'track URL')
        formats = [{
            'format_id': 'mp3',
            'url': track_url,
            'vcodec': 'none',
        }]

        return {
            'id': track_id,
            # Key uses an underscore; the previous 'display-id' spelling
            # did not match the info dict field name.
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'view_count': view_count,
            'comment_count': comment_count,
            'like_count': like_count,
            'categories': categories,
        }