release 2015.02.10.2

[YoutubeDL] Add generic video filtering (Fixes #4916 )
This functionality is intended to eventually encompass the current format filtering.
2025-10-29 09:26:20 -07:00 · 2015-02-10 03:32:55 +01:00 · 2015-02-10 03:32:24 +01:00 · 2015-02-10 01:46:09 +01:00 · 2015-02-10 01:40:55 +01:00 · 2015-02-10 01:37:14 +01:00
12 changed files with 220 additions and 30 deletions
--- a/README.md
+++ b/README.md
@@ -119,6 +119,23 @@ which means you can modify it, redistribute it or use it however you like.
                                     COUNT views
    --max-views COUNT                Do not download any videos with more than
                                     COUNT views
+    --match-filter FILTER            (Experimental) Generic video filter.
+                                     Specify any key (see help for -o for a list
+                                     of available keys) to match if the key is
+                                     present, !key to check if the key is not
+                                     present, key > NUMBER (like "comment_count >
+                                     12", also works with >=, <, <=, !=, =) to
+                                     compare against a number, and & to require
+                                     multiple matches. Values which are not
+                                     known are excluded unless you put a
+                                     question mark (?) after the operator. For
+                                     example, to only match videos that have
+                                     been liked more than 100 times and disliked
+                                     less than 50 times (or the dislike
+                                     functionality is not available at the given
+                                     service), but which also have a description,
+                                     use  --match-filter "like_count > 100 &
+                                     dislike_count <? 50 & description" .
    --no-playlist                    If the URL refers to a video and a
                                     playlist, download only the video.
    --age-limit YEARS                download only videos suitable for the given
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -53,6 +53,7 @@ from youtube_dl.utils import (
    version_tuple,
    xpath_with_ns,
    render_table,
+    match_str,
 )


@@ -459,6 +460,37 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
            '123  4\n'
            '9999 51')

+    def test_match_str(self):
+        self.assertRaises(ValueError, match_str, 'xy>foobar', {})
+        self.assertFalse(match_str('xy', {'x': 1200}))
+        self.assertTrue(match_str('!xy', {'x': 1200}))
+        self.assertTrue(match_str('x', {'x': 1200}))
+        self.assertFalse(match_str('!x', {'x': 1200}))
+        self.assertTrue(match_str('x', {'x': 0}))
+        self.assertFalse(match_str('x>0', {'x': 0}))
+        self.assertFalse(match_str('x>0', {}))
+        self.assertTrue(match_str('x>?0', {}))
+        self.assertTrue(match_str('x>1K', {'x': 1200}))
+        self.assertFalse(match_str('x>2K', {'x': 1200}))
+        self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
+        self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
+        self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
+        self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
+        self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
+        self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'}))
+        self.assertFalse(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 90, 'description': 'foo'}))
+        self.assertTrue(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 190, 'description': 'foo'}))
+        self.assertFalse(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 190, 'dislike_count': 60, 'description': 'foo'}))
+        self.assertFalse(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 190, 'dislike_count': 10}))
+

 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -228,6 +228,11 @@ class YoutubeDL(object):
    external_downloader:  Executable of the external downloader to call.
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
+    match_filter:      A function that gets called with the info_dict of
+                       every video.
+                       If it returns a message, the video is ignored.
+                       If it returns None, the video is downloaded.
+                       match_filter_func in utils.py is one example for this.


    The following parameters are not used by YoutubeDL itself, they are used by
@@ -583,9 +588,16 @@ class YoutubeDL(object):
            if max_views is not None and view_count > max_views:
                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
-            return 'Skipping "%s" because it is age restricted' % title
+            return 'Skipping "%s" because it is age restricted' % video_title
        if self.in_download_archive(info_dict):
            return '%s has already been recorded in archive' % video_title
+
+        match_filter = self.params.get('match_filter')
+        if match_filter is not None:
+            ret = match_filter(info_dict)
+            if ret is not None:
+                return ret
+
        return None

    @staticmethod
--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@@ -23,9 +23,10 @@ from .compat import (
 )
 from .utils import (
    DateRange,
-    DEFAULT_OUTTMPL,
    decodeOption,
+    DEFAULT_OUTTMPL,
    DownloadError,
+    match_filter_func,
    MaxDownloadsReached,
    preferredencoding,
    read_batch_urls,
@@ -247,6 +248,9 @@ def _real_main(argv=None):
            xattr  # Confuse flake8
        except ImportError:
            parser.error('setting filesize xattr requested but python-xattr is not available')
+    match_filter = (
+        None if opts.match_filter is None
+        else match_filter_func(opts.match_filter))

    ydl_opts = {
        'usenetrc': opts.usenetrc,
@@ -344,6 +348,7 @@ def _real_main(argv=None):
        'list_thumbnails': opts.list_thumbnails,
        'playlist_items': opts.playlist_items,
        'xattr_set_filesize': opts.xattr_set_filesize,
+        'match_filter': match_filter,
    }

    with YoutubeDL(ydl_opts) as ydl:
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@@ -74,7 +74,7 @@ from .collegehumor import CollegeHumorIE
 from .collegerama import CollegeRamaIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
 from .comcarcoff import ComCarCoffIE
-from .commonmistakes import CommonMistakesIE
+from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .condenast import CondeNastIE
 from .cracked import CrackedIE
 from .criterion import CriterionIE
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -72,26 +72,29 @@ class BandcampIE(InfoExtractor):

        download_link = m_download.group(1)
        video_id = self._search_regex(
-            r'var TralbumData = {.*?id: (?P<id>\d+),?$',
-            webpage, 'video id', flags=re.MULTILINE | re.DOTALL)
+            r'(?ms)var TralbumData = {.*?id: (?P<id>\d+),?$',
+            webpage, 'video id')

        download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
-        info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)
-        info = json.loads(info)[0]
+        all_info = self._parse_json(self._search_regex(
+            r'(?sm)items: (.*?),$', download_webpage, 'items'), video_id)
+        info = all_info[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info['downloads']['mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info['url']
-        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
-        m_url = re.match(re_url, initial_url)
+        m_url = re.match(
+            r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
+            initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
-        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
+        final_url = self._search_regex(
+            r'"retry_url":"(.*?)"', final_url_webpage, 'final video URL')

        return {
            'id': video_id,
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -264,8 +264,15 @@ class InfoExtractor(object):

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
-        self.initialize()
-        return self._real_extract(url)
+        try:
+            self.initialize()
+            return self._real_extract(url)
+        except ExtractorError:
+            raise
+        except compat_http_client.IncompleteRead as e:
+            raise ExtractorError('A network error has occured.', cause=e, expected=True)
+        except (KeyError,) as e:
+            raise ExtractorError('An extractor error has occured.', cause=e)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
--- a/youtube_dl/extractor/commonmistakes.py
+++ b/youtube_dl/extractor/commonmistakes.py
@@ -24,6 +24,23 @@ class CommonMistakesIE(InfoExtractor):
            'That doesn\'t make any sense. '
            'Simply remove the parameter in your command or configuration.'
        ) % url
-        if self._downloader.params.get('verbose'):
+        if not self._downloader.params.get('verbose'):
            msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.'
        raise ExtractorError(msg, expected=True)
+
+
+class UnicodeBOMIE(InfoExtractor):
+        IE_DESC = False
+        _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$'
+
+        _TESTS = [{
+            'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc',
+            'only_matching': True,
+        }]
+
+        def _real_extract(self, url):
+            real_url = self._match_id(url)
+            self.report_warning(
+                'Your URL starts with a Byte Order Mark (BOM). '
+                'Removing the BOM and looking for "%s" ...' % real_url)
+            return self.url_result(real_url)
--- a/youtube_dl/extractor/svtplay.py
+++ b/youtube_dl/extractor/svtplay.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals

 from .common import InfoExtractor
@@ -10,13 +11,13 @@ class SVTPlayIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?svtplay\.se/video/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
-        'md5': '2521cd644e862936cf2e698206e47385',
+        'md5': 'f4a184968bc9c802a9b41316657aaa80',
        'info_dict': {
-            'id': '3966754',
+            'id': '2609989',
            'ext': 'mp4',
-            'title': 'FIFA 14 - E3 2013 Trailer',
+            'title': 'SM veckan vinter, Örebro - Rally, final',
            'duration': 4500,
-            'thumbnail': 're:^https?://.*\.jpg$',
+            'thumbnail': 're:^https?://.*[\.-]jpg$',
        },
    }

--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -244,6 +244,25 @@ def parseOpts(overrideArguments=None):
        '--max-views',
        metavar='COUNT', dest='max_views', default=None, type=int,
        help='Do not download any videos with more than COUNT views')
+    selection.add_option(
+        '--match-filter',
+        metavar='FILTER', dest='match_filter', default=None,
+        help=(
+            '(Experimental) Generic video filter. '
+            'Specify any key (see help for -o for a list of available keys) to'
+            ' match if the key is present, '
+            '!key to check if the key is not present,'
+            'key > NUMBER (like "comment_count > 12", also works with '
+            '>=, <, <=, !=, =) to compare against a number, and '
+            '& to require multiple matches. '
+            'Values which are not known are excluded unless you'
+            ' put a question mark (?) after the operator.'
+            'For example, to only match videos that have been liked more than '
+            '100 times and disliked less than 50 times (or the dislike '
+            'functionality is not available at the given service), but who '
+            'also have a description, use  --match-filter '
+            '"like_count > 100 & dislike_count <? 50 & description" .'
+        ))
    selection.add_option(
        '--no-playlist',
        action='store_true', dest='noplaylist', default=False,
@@ -734,22 +753,22 @@ def parseOpts(overrideArguments=None):
        if opts.verbose:
            write_string('[debug] Override config: ' + repr(overrideArguments) + '\n')
    else:
-        commandLineConf = sys.argv[1:]
-        if '--ignore-config' in commandLineConf:
-            systemConf = []
-            userConf = []
+        command_line_conf = sys.argv[1:]
+        if '--ignore-config' in command_line_conf:
+            system_conf = []
+            user_conf = []
        else:
-            systemConf = _readOptions('/etc/youtube-dl.conf')
-            if '--ignore-config' in systemConf:
-                userConf = []
+            system_conf = _readOptions('/etc/youtube-dl.conf')
+            if '--ignore-config' in system_conf:
+                user_conf = []
            else:
-                userConf = _readUserConf()
-        argv = systemConf + userConf + commandLineConf
+                user_conf = _readUserConf()
+        argv = system_conf + user_conf + command_line_conf

        opts, args = parser.parse_args(argv)
        if opts.verbose:
-            write_string('[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
-            write_string('[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
-            write_string('[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
+            write_string('[debug] System config: ' + repr(_hide_login_info(system_conf)) + '\n')
+            write_string('[debug] User config: ' + repr(_hide_login_info(user_conf)) + '\n')
+            write_string('[debug] Command-line args: ' + repr(_hide_login_info(command_line_conf)) + '\n')

    return parser, opts, args
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -17,6 +17,7 @@ import io
 import json
 import locale
 import math
+import operator
 import os
 import pipes
 import platform
@@ -1678,3 +1679,79 @@ def render_table(header_row, data):
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
+
+
+def _match_one(filter_part, dct):
+    COMPARISON_OPERATORS = {
+        '<': operator.lt,
+        '<=': operator.le,
+        '>': operator.gt,
+        '>=': operator.ge,
+        '=': operator.eq,
+        '!=': operator.ne,
+    }
+    operator_rex = re.compile(r'''(?x)\s*
+        (?P<key>[a-z_]+)
+        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+        (?:
+            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
+            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
+        )
+        \s*$
+        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        op = COMPARISON_OPERATORS[m.group('op')]
+        if m.group('strval') is not None:
+            if m.group('op') not in ('=', '!='):
+                raise ValueError(
+                    'Operator %s does not support string values!' % m.group('op'))
+            comparison_value = m.group('strval')
+        else:
+            try:
+                comparison_value = int(m.group('intval'))
+            except ValueError:
+                comparison_value = parse_filesize(m.group('intval'))
+                if comparison_value is None:
+                    comparison_value = parse_filesize(m.group('intval') + 'B')
+                if comparison_value is None:
+                    raise ValueError(
+                        'Invalid integer value %r in filter part %r' % (
+                            m.group('intval'), filter_part))
+        actual_value = dct.get(m.group('key'))
+        if actual_value is None:
+            return m.group('none_inclusive')
+        return op(actual_value, comparison_value)
+
+    UNARY_OPERATORS = {
+        '': lambda v: v is not None,
+        '!': lambda v: v is None,
+    }
+    operator_rex = re.compile(r'''(?x)\s*
+        (?P<op>%s)\s*(?P<key>[a-z_]+)
+        \s*$
+        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        op = UNARY_OPERATORS[m.group('op')]
+        actual_value = dct.get(m.group('key'))
+        return op(actual_value)
+
+    raise ValueError('Invalid filter part %r' % filter_part)
+
+
+def match_str(filter_str, dct):
+    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False """
+
+    return all(
+        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
+
+
+def match_filter_func(filter_str):
+    def _match_func(info_dict):
+        if match_str(filter_str, info_dict):
+            return None
+        else:
+            video_title = info_dict.get('title', info_dict.get('id', 'video'))
+            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
+    return _match_func
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2015.02.09.3'
+__version__ = '2015.02.10.2'
Author	SHA1	Message	Date
Philipp Hagemeister	845734773d	release 2015.02.10.2	2015-02-10 03:32:55 +01:00
Philipp Hagemeister	347de4931c	[YoutubeDL] Add generic video filtering (Fixes #4916 ) This functionality is intended to eventually encompass the current format filtering.	2015-02-10 03:32:24 +01:00
Philipp Hagemeister	8829650513	release 2015.02.10.1	2015-02-10 01:46:09 +01:00
Philipp Hagemeister	c73fae1e2e	[commonmistakes] Detect BOMs at the beginning of URLs Reported at https://bugzilla.redhat.com/show_bug.cgi?id=1093517 .	2015-02-10 01:40:55 +01:00
Philipp Hagemeister	834bf069d2	[bandcamp] Correct variable name	2015-02-10 01:37:14 +01:00
Philipp Hagemeister	c06a9fa34f	Use snake_case instead of camelCase	2015-02-10 01:36:38 +01:00
Philipp Hagemeister	753fad4adc	[commonmistakes] Correct logic error	2015-02-10 01:34:01 +01:00
Philipp Hagemeister	34814eb66e	release 2015.02.10	2015-02-10 01:19:52 +01:00
Philipp Hagemeister	3a5bcd0326	[extractor/common] Wrap extractor errors (Fixes #1194 ) For now, we just wrap some common errors. More may follow. We do not want to catch actual programming errors in the extractors, such as 1 // 0.	2015-02-10 01:17:23 +01:00
Philipp Hagemeister	99c2398bc6	[bandcamp] Use our API to get more stable error messages (#1194 )	2015-02-09 19:08:51 +01:00
Philipp Hagemeister	28f1272870	[svtplay] Correct test case	2015-02-09 16:05:01 +01:00