From 4e0c0c1508810eb494cd32ef00fb75d03d03ce6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 21:50:23 +0600 Subject: [PATCH] [xiami] Improve extraction (Closes #9079) * Switch to JSON source * Add abstract IE for playlists * Extract more track related metadata --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/xiami.py | 199 ++++++++++++++--------------- 2 files changed, 99 insertions(+), 102 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 14ca9eaee..737960a01 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -942,7 +942,7 @@ from .xhamster import ( XHamsterEmbedIE, ) from .xiami import ( - XiamiIE, + XiamiSongIE, XiamiAlbumIE, XiamiArtistIE, XiamiCollectionIE diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index a28d63c48..e4ed306b4 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -1,50 +1,42 @@ -# -*- coding: utf-8 -*- - +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - xpath_element, - xpath_text, - xpath_with_ns, - int_or_none, - ExtractorError -) from ..compat import compat_urllib_parse_unquote +from ..utils import int_or_none class XiamiBaseIE(InfoExtractor): - - _XML_BASE_URL = 'http://www.xiami.com/song/playlist/id' - _NS_MAP = {'xm': 'http://xspf.org/ns/0/'} - - def _extract_track(self, track): - artist = xpath_text(track, xpath_with_ns('xm:artist', self._NS_MAP), default='') - artist = artist.split(';') - - ret = { - 'id': xpath_text(track, xpath_with_ns('xm:song_id', self._NS_MAP)), - 'title': xpath_text(track, xpath_with_ns('xm:title', self._NS_MAP)), - 'album': xpath_text(track, xpath_with_ns('xm:album_name', self._NS_MAP)), - 'artist': ';'.join(artist) if artist else None, - 'creator': artist[0] if artist else None, - 'url': self._decrypt(xpath_text(track, xpath_with_ns('xm:location', self._NS_MAP))), - 'thumbnail': xpath_text(track, xpath_with_ns('xm:pic', self._NS_MAP), default=None), - 'duration': int_or_none(xpath_text(track, xpath_with_ns('xm:length', self._NS_MAP))), + _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id' + + def _extract_track(self, track, track_id=None): + title = track['title'] + track_url = self._decrypt(track['location']) + + subtitles = {} + lyrics_url = track.get('lyric_url') or track.get('lyric') + if lyrics_url and lyrics_url.startswith('http'): + subtitles['origin'] = [{'url': lyrics_url}] + + return { + 'id': track.get('song_id') or track_id, + 'url': track_url, + 'title': title, + 'thumbnail': track.get('pic') or track.get('album_pic'), + 'duration': int_or_none(track.get('length')), + 'creator': track.get('artist', '').split(';')[0], + 'track': title, + 'album': track.get('album_name'), + 'artist': track.get('artist'), + 'subtitles': subtitles, } - lyrics_url = xpath_text(track, xpath_with_ns('xm:lyric', self._NS_MAP)) - if lyrics_url and lyrics_url.endswith('.lrc'): - ret['description'] = self._download_webpage(lyrics_url, ret['id']) - return ret - - def _extract_xml(self, _id, typ=''): - playlist = self._download_xml('%s/%s%s' % (self._XML_BASE_URL, _id, typ), _id) - tracklist = xpath_element(playlist, xpath_with_ns('./xm:trackList', self._NS_MAP)) - - if not len(tracklist): - raise ExtractorError('No track found') - return [self._extract_track(track) for track in tracklist] + def _extract_tracks(self, item_id, typ=None): + playlist = self._download_json( + '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id) + return [ + self._extract_track(track, item_id) + for track in playlist['data']['trackList']] @staticmethod def _decrypt(origin): @@ -62,75 +54,87 @@ class XiamiBaseIE(InfoExtractor): ans = '' for i in range(0, short_lenth + 1): for j in range(0, n): - if len(l[j])>i: + if len(l[j]) > i: ans += l[j][i] return compat_urllib_parse_unquote(ans).replace('^', '0') -class XiamiIE(XiamiBaseIE): +class XiamiSongIE(XiamiBaseIE): IE_NAME = 'xiami:song' IE_DESC = '虾米音乐' - _VALID_URL = r'http://www\.xiami\.com/song/(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.xiami.com/song/1775610518', - 'md5': '521dd6bea40fd5c9c69f913c232cb57e', - 'info_dict': { - 'id': '1775610518', - 'ext': 'mp3', - 'title': 'Woman', - 'creator': 'HONNE', - 'album': 'Woman', - 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', - 'description': 'md5:052ec7de41ca19f67e7fd70a1bfc4e0b', - } - }, - { - 'url': 'http://www.xiami.com/song/1775256504', - 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc', - 'info_dict': { - 'id': '1775256504', - 'ext': 'mp3', - 'title': '悟空', - 'creator': '戴荃', - 'album': '悟空', - 'description': 'md5:206e67e84f9bed1d473d04196a00b990', - } - }, - ] + _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.xiami.com/song/1775610518', + 'md5': '521dd6bea40fd5c9c69f913c232cb57e', + 'info_dict': { + 'id': '1775610518', + 'ext': 'mp3', + 'title': 'Woman', + 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', + 'duration': 265, + 'creator': 'HONNE', + 'track': 'Woman', + 'album': 'Woman', + 'artist': 'HONNE', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }], + }, + } + }, { + 'url': 'http://www.xiami.com/song/1775256504', + 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc', + 'info_dict': { + 'id': '1775256504', + 'ext': 'mp3', + 'title': '悟空', + 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', + 'duration': 200, + 'creator': '戴荃', + 'track': '悟空', + 'album': '悟空', + 'artist': '戴荃', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }], + }, + } + }] def _real_extract(self, url): - _id = self._match_id(url) - return self._extract_xml(_id)[0] + return self._extract_tracks(self._match_id(url))[0] -class XiamiAlbumIE(XiamiBaseIE): +class XiamiPlaylistBaseIE(XiamiBaseIE): + def _real_extract(self, url): + item_id = self._match_id(url) + return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id) + + +class XiamiAlbumIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:album' IE_DESC = '虾米音乐 - 专辑' - _VALID_URL = r'http://www\.xiami\.com/album/(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.xiami.com/album/2100300444', - 'info_dict': { - 'id': '2100300444', - }, - 'playlist_count': 10, + _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P[0-9]+)' + _TYPE = '1' + _TESTS = [{ + 'url': 'http://www.xiami.com/album/2100300444', + 'info_dict': { + 'id': '2100300444', }, - { - 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - _id = self._match_id(url) - return self.playlist_result(self._extract_xml(_id, '/type/1'), _id) + 'playlist_count': 10, + }, { + 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', + 'only_matching': True, + }] -class XiamiArtistIE(XiamiBaseIE): +class XiamiArtistIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:artist' IE_DESC = '虾米音乐 - 歌手' - _VALID_URL = r'http://www\.xiami\.com/artist/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P[0-9]+)' + _TYPE = '2' _TEST = { 'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp', 'info_dict': { @@ -139,23 +143,16 @@ class XiamiArtistIE(XiamiBaseIE): 'playlist_count': 20, } - def _real_extract(self, url): - _id = self._match_id(url) - return self.playlist_result(self._extract_xml(_id, '/type/2'), _id) - -class XiamiCollectionIE(XiamiBaseIE): +class XiamiCollectionIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:collection' IE_DESC = '虾米音乐 - 精选集' - _VALID_URL = r'http://www\.xiami\.com/collect/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P[0-9]+)' + _TYPE = '3' _TEST = { 'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr', 'info_dict': { 'id': '156527391', }, - 'playlist_count': 26, + 'playlist_mincount': 29, } - - def _real_extract(self, url): - _id = self._match_id(url) - return self.playlist_result(self._extract_xml(_id, '/type/3'), _id)