youtube-dl/youtube_dl/extractor/archiveorg.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from ..utils import (
    unified_strdate,
)


class ArchiveOrgIE(InfoExtractor):
    """Information extractor for video items hosted on archive.org.

    The extractor requests the JSON description of an item's ``/details/``
    page (by adding ``output=json`` to the query string) and builds the
    info dict from the ``metadata``, ``files``, ``server``, ``dir`` and
    ``misc`` entries of that document.
    """

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
    _TEST = {
        "url": "http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
        'file': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            "title": "1968 Demo - FJCC Conference Presentation Reel #1",
            "description": "Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
            "upload_date": "19681210",
            "uploader": "SRI International"
        }
    }

    def get_optional_metadata(self, data, field):
        """Return the first value of ``data['metadata'][field]`` or None.

        ``data`` is the decoded JSON document of the item; ``field`` is the
        metadata key to look up. None is returned when the ``metadata``
        section or the field is missing, when the field holds an empty
        sequence, or when it holds a value that cannot be indexed
        (e.g. null).
        """
        try:
            return data['metadata'][field][0]
        except (KeyError, IndexError, TypeError):
            # Metadata is optional: a missing, empty or null entry simply
            # means "no value" rather than an extraction failure.
            return None

    def _real_extract(self, url):
        """Build the info dict for the archive.org item addressed by ``url``.

        Returns a dict with ``_type`` 'video', the item id taken from the
        URL, the optional metadata fields (title, description, uploader,
        upload_date), the list of video formats and the thumbnail, if any.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Append output=json as a query parameter: use '&' when the URL
        # already carries a query string, otherwise start one with '?'.
        json_url = url + ('&' if '?' in url else '?') + 'output=json'
        json_data = self._download_webpage(json_url, video_id)
        data = json.loads(json_data)

        title = self.get_optional_metadata(data, 'title')
        description = self.get_optional_metadata(data, 'description')
        uploader = self.get_optional_metadata(data, 'creator')
        upload_date = self.get_optional_metadata(data, 'date')
        if upload_date:
            upload_date = unified_strdate(upload_date)

        # Keys of data['files'] are appended verbatim to server + dir to
        # form the download URL; only entries whose format description
        # mentions 'Video' are kept. Entries lacking a 'format' key are
        # skipped instead of aborting the whole extraction.
        # NOTE(review): the size is stored under 'file_size'; confirm this
        # is the key the format sorting / downloader code expects.
        formats = [
            {
                'format': fdata['format'],
                'url': 'http://' + data['server'] + data['dir'] + fn,
                'file_size': int(fdata['size']),
            }
            for fn, fdata in data['files'].items()
            if 'Video' in fdata.get('format', '')]

        self._sort_formats(formats)

        return {
            '_type': 'video',
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
            'upload_date': upload_date,
            # 'misc' and its 'image' entry are both optional.
            'thumbnail': data.get('misc', {}).get('image'),
        }
More unicode literals 11 years ago			`from __future__ import unicode_literals`

[archive.org] Add extractor (Fixes #1003) 11 years ago			`import json`
			`import re`

			`from .common import InfoExtractor`
			`from ..utils import (`
			`unified_strdate,`
			`)`


			`class ArchiveOrgIE(InfoExtractor):`
			`IE_NAME = 'archive.org'`
			`IE_DESC = 'archive.org videos'`
Correct some extractor _VALID_URL regexes 11 years ago			`_VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'`
[archive.org] Add extractor (Fixes #1003) 11 years ago			`_TEST = {`
More unicode literals 11 years ago			`"url": "http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",`
			`'file': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',`
			`'md5': '8af1d4cf447933ed3c7f4871162602db',`
			`'info_dict': {`
			`"title": "1968 Demo - FJCC Conference Presentation Reel #1",`
			"description": "Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 \| <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> \| <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
			`"upload_date": "19681210",`
			`"uploader": "SRI International"`
[archive.org] Add extractor (Fixes #1003) 11 years ago			`}`
			`}`

[archiveorg] most metadata fields are optional Example: https://archive.org/details/Cops1922 10 years ago			`def get_optional_metadata(self, data, field):`
			`try:`
			`return data['metadata'][field][0]`
			`except KeyError:`
			`return None`

[archive.org] Add extractor (Fixes #1003) 11 years ago			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`video_id = mobj.group('id')`

More unicode literals 11 years ago			`json_url = url + ('?' if '?' in url else '&') + 'output=json'`
[archive.org] Add extractor (Fixes #1003) 11 years ago			`json_data = self._download_webpage(json_url, video_id)`
			`data = json.loads(json_data)`

[archiveorg] most metadata fields are optional Example: https://archive.org/details/Cops1922 10 years ago			`title = self.get_optional_metadata(data, 'title')`
			`description = self.get_optional_metadata(data, 'description')`
			`uploader = self.get_optional_metadata(data, 'creator')`
			`upload_date = self.get_optional_metadata(data, 'date')`
			`if upload_date:`
			`upload_date = unified_strdate(upload_date)`
[archive.org] Add extractor (Fixes #1003) 11 years ago
[archiveorg] Use centralized sorting 11 years ago			`formats = [`
			`{`
[archive.org] Add extractor (Fixes #1003) 11 years ago			`'format': fdata['format'],`
			`'url': 'http://' + data['server'] + data['dir'] + fn,`
			`'file_size': int(fdata['size']),`
			`}`
[archiveorg] Use centralized sorting 11 years ago			`for fn, fdata in data['files'].items()`
[archive.org] Add extractor (Fixes #1003) 11 years ago			`if 'Video' in fdata['format']]`
[archiveorg] Use centralized sorting 11 years ago
			`self._sort_formats(formats)`
[archive.org] Add extractor (Fixes #1003) 11 years ago
Remove the compatibility code used before the new format system was implemented 11 years ago			`return {`
Remove video_result helper method Calling it was more complex than actually including the type in the video info 11 years ago			`'_type': 'video',`
[archive.org] Add extractor (Fixes #1003) 11 years ago			`'id': video_id,`
			`'title': title,`
			`'formats': formats,`
			`'description': description,`
			`'uploader': uploader,`
			`'upload_date': upload_date,`
Remove the compatibility code used before the new format system was implemented 11 years ago			`'thumbnail': data.get('misc', {}).get('image'),`
[archive.org] Add extractor (Fixes #1003) 11 years ago			`}`