mirror of
				https://github.com/ytdl-org/youtube-dl.git
				synced 2025-10-29 09:26:20 -07:00 
			
		
		
		
	Merge branch 'extract_info_rewrite'
This commit is contained in:
		| @@ -150,6 +150,8 @@ The `-o` option allows users to indicate a template for the output file names. T | ||||
|  - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4). | ||||
|  - `epoch`: The sequence will be replaced by the Unix epoch when creating the file. | ||||
|  - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. | ||||
|  - `playlist`: The name or the id of the playlist that contains the video. | ||||
|  - `playlist_index`: The index of the video in the playlist, a five-digit number. | ||||
|  | ||||
| The current default template is `%(id)s.%(ext)s`, but that will be switched to `%(title)s-%(id)s.%(ext)s` (which can be requested with `-t` at the moment). | ||||
|  | ||||
|   | ||||
| @@ -10,6 +10,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | ||||
|  | ||||
| from youtube_dl.InfoExtractors import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE | ||||
| from youtube_dl.utils import * | ||||
| from youtube_dl.FileDownloader import FileDownloader | ||||
|  | ||||
| PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") | ||||
| with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: | ||||
| @@ -22,7 +23,7 @@ proxy_handler = compat_urllib_request.ProxyHandler() | ||||
| opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) | ||||
| compat_urllib_request.install_opener(opener) | ||||
|  | ||||
| class FakeDownloader(object): | ||||
| class FakeDownloader(FileDownloader): | ||||
|     def __init__(self): | ||||
|         self.result = [] | ||||
|         self.params = parameters | ||||
| @@ -30,35 +31,42 @@ class FakeDownloader(object): | ||||
|         print(s) | ||||
|     def trouble(self, s): | ||||
|         raise Exception(s) | ||||
|     def download(self, x): | ||||
|         self.result.append(x) | ||||
|     def extract_info(self, url): | ||||
|         self.result.append(url) | ||||
|         return url | ||||
|  | ||||
| class TestYoutubeLists(unittest.TestCase): | ||||
|     def assertIsPlaylist(self,info): | ||||
|         """Make sure the info has '_type' set to 'playlist'""" | ||||
|         self.assertEqual(info['_type'], 'playlist') | ||||
|  | ||||
|     def test_youtube_playlist(self): | ||||
|         dl = FakeDownloader() | ||||
|         ie = YoutubePlaylistIE(dl) | ||||
|         ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') | ||||
|         ytie_results = [YoutubeIE()._extract_id(r[0]) for r in dl.result] | ||||
|         result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0] | ||||
|         self.assertIsPlaylist(result) | ||||
|         ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] | ||||
|         self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE']) | ||||
|  | ||||
|     def test_issue_673(self): | ||||
|         dl = FakeDownloader() | ||||
|         ie = YoutubePlaylistIE(dl) | ||||
|         ie.extract('PLBB231211A4F62143') | ||||
|         self.assertTrue(len(dl.result) > 40) | ||||
|         result = ie.extract('PLBB231211A4F62143')[0] | ||||
|         self.assertTrue(len(result['entries']) > 40) | ||||
|  | ||||
|     def test_youtube_playlist_long(self): | ||||
|         dl = FakeDownloader() | ||||
|         ie = YoutubePlaylistIE(dl) | ||||
|         ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') | ||||
|         self.assertTrue(len(dl.result) >= 799) | ||||
|         result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0] | ||||
|         self.assertIsPlaylist(result) | ||||
|         self.assertTrue(len(result['entries']) >= 799) | ||||
|  | ||||
|     def test_youtube_playlist_with_deleted(self): | ||||
|         #651 | ||||
|         dl = FakeDownloader() | ||||
|         ie = YoutubePlaylistIE(dl) | ||||
|         ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') | ||||
|         ytie_results = [YoutubeIE()._extract_id(r[0]) for r in dl.result] | ||||
|         result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0] | ||||
|         ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] | ||||
|         self.assertFalse('pElCt5oNDuI' in ytie_results) | ||||
|         self.assertFalse('KdPEApIVdWM' in ytie_results) | ||||
|  | ||||
| @@ -66,10 +74,11 @@ class TestYoutubeLists(unittest.TestCase): | ||||
|         dl = FakeDownloader() | ||||
|         ie = YoutubePlaylistIE(dl) | ||||
|         # TODO find a > 100 (paginating?) videos course | ||||
|         ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') | ||||
|         self.assertEqual(YoutubeIE()._extract_id(dl.result[0][0]), 'j9WZyLZCBzs') | ||||
|         self.assertEqual(len(dl.result), 25) | ||||
|         self.assertEqual(YoutubeIE()._extract_id(dl.result[-1][0]), 'rYefUsYuEp0') | ||||
|         result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0] | ||||
|         entries = result['entries'] | ||||
|         self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs') | ||||
|         self.assertEqual(len(entries), 25) | ||||
|         self.assertEqual(YoutubeIE()._extract_id(entries[-1]['url']), 'rYefUsYuEp0') | ||||
|  | ||||
|     def test_youtube_channel(self): | ||||
|         # I give up, please find a channel that does paginate and test this like test_youtube_playlist_long | ||||
| @@ -78,8 +87,8 @@ class TestYoutubeLists(unittest.TestCase): | ||||
|     def test_youtube_user(self): | ||||
|         dl = FakeDownloader() | ||||
|         ie = YoutubeUserIE(dl) | ||||
|         ie.extract('https://www.youtube.com/user/TheLinuxFoundation') | ||||
|         self.assertTrue(len(dl.result) >= 320) | ||||
|         result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0] | ||||
|         self.assertTrue(len(result['entries']) >= 320) | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     unittest.main() | ||||
|   | ||||
| @@ -393,6 +393,8 @@ class FileDownloader(object): | ||||
|                 autonumber_size = 5 | ||||
|             autonumber_templ = u'%0' + str(autonumber_size) + u'd' | ||||
|             template_dict['autonumber'] = autonumber_templ % self._num_downloads | ||||
|             if template_dict['playlist_index'] is not None: | ||||
|                 template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] | ||||
|  | ||||
|             sanitize = lambda k,v: sanitize_filename( | ||||
|                 u'NA' if v is None else compat_str(v), | ||||
| @@ -423,9 +425,109 @@ class FileDownloader(object): | ||||
|                 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' | ||||
|         return None | ||||
|          | ||||
|     def extract_info(self, url, download = True): | ||||
|         ''' | ||||
|         Returns a list with a dictionary for each video we find. | ||||
|         If 'download', also downloads the videos. | ||||
|          ''' | ||||
|         suitable_found = False | ||||
|         for ie in self._ies: | ||||
|             # Go to next InfoExtractor if not suitable | ||||
|             if not ie.suitable(url): | ||||
|                 continue | ||||
|  | ||||
|             # Warn if the _WORKING attribute is False | ||||
|             if not ie.working(): | ||||
|                 self.to_stderr(u'WARNING: the program functionality for this site has been marked as broken, ' | ||||
|                                u'and will probably not work. If you want to go on, use the -i option.') | ||||
|  | ||||
|             # Suitable InfoExtractor found | ||||
|             suitable_found = True | ||||
|  | ||||
|             # Extract information from URL and process it | ||||
|             try: | ||||
|                 ie_results = ie.extract(url) | ||||
|                 results = [] | ||||
|                 for ie_result in ie_results: | ||||
|                     if not 'extractor' in ie_result: | ||||
|                         #Set the extractor only if it has not already been set somewhere else | ||||
|                         ie_result['extractor'] = ie.IE_NAME | ||||
|                     results.append(self.process_ie_result(ie_result, download)) | ||||
|                 return results | ||||
|             except ExtractorError as de: # An error we somewhat expected | ||||
|                 self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback()) | ||||
|                 break | ||||
|             except Exception as e: | ||||
|                 if self.params.get('ignoreerrors', False): | ||||
|                     self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc())) | ||||
|                     break | ||||
|                 else: | ||||
|                     raise | ||||
|         if not suitable_found: | ||||
|                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url) | ||||
|          | ||||
|     def process_ie_result(self, ie_result, download = True): | ||||
|         """ | ||||
|         Take the result of the ie and return a list of videos. | ||||
|         For url elements it will search the suitable ie and get the videos | ||||
|         For playlist elements it will process each of the elements of the 'entries' key | ||||
|          | ||||
|         It will also download the videos if 'download'. | ||||
|         """ | ||||
|         result_type = ie_result.get('_type', 'video') #If not given we assume it's a video, to support the old default system | ||||
|         if result_type == 'video': | ||||
|             if 'playlist' not in ie_result: | ||||
|                 #It isn't part of a playlist | ||||
|                 ie_result['playlist'] = None | ||||
|                 ie_result['playlist_index'] = None | ||||
|             if download: | ||||
|                 #Do the download: | ||||
|                 self.process_info(ie_result) | ||||
|             return ie_result | ||||
|         elif result_type == 'url': | ||||
|             #We get the video pointed by the url | ||||
|             result = self.extract_info(ie_result['url'], download)[0] | ||||
|             return result | ||||
|         elif result_type == 'playlist': | ||||
|             #We process each entry in the playlist | ||||
|             playlist = ie_result.get('title', None) or ie_result.get('id', None) | ||||
|             self.to_screen(u'[download] Downloading playlist: %s'  % playlist) | ||||
|  | ||||
|             playlist_results = [] | ||||
|  | ||||
|             n_all_entries = len(ie_result['entries']) | ||||
|             playliststart = self.params.get('playliststart', 1) - 1 | ||||
|             playlistend = self.params.get('playlistend', -1) | ||||
|  | ||||
|             if playlistend == -1: | ||||
|                 entries = ie_result['entries'][playliststart:] | ||||
|             else: | ||||
|                 entries = ie_result['entries'][playliststart:playlistend] | ||||
|  | ||||
|             n_entries = len(entries) | ||||
|  | ||||
|             self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % | ||||
|                 (ie_result['extractor'], playlist, n_all_entries, n_entries)) | ||||
|  | ||||
|             for i,entry in enumerate(entries,1): | ||||
|                 self.to_screen(u'[download] Downloading video #%s of %s' %(i, n_entries)) | ||||
|                 entry_result = self.process_ie_result(entry, False) | ||||
|                 entry_result['playlist'] = playlist | ||||
|                 entry_result['playlist_index'] = i + playliststart | ||||
|                 #We must do the download here to correctly set the 'playlist' key | ||||
|                 if download: | ||||
|                     self.process_info(entry_result) | ||||
|                 playlist_results.append(entry_result) | ||||
|             result = ie_result.copy() | ||||
|             result['entries'] = playlist_results | ||||
|             return result | ||||
|  | ||||
|     def process_info(self, info_dict): | ||||
|         """Process a single dictionary returned by an InfoExtractor.""" | ||||
|  | ||||
|         #We increment the download count here to match the previous behaviour. | ||||
|         self.increment_downloads() | ||||
|          | ||||
|         info_dict['fulltitle'] = info_dict['title'] | ||||
|         if len(info_dict['title']) > 200: | ||||
|             info_dict['title'] = info_dict['title'][:197] + u'...' | ||||
| @@ -564,53 +666,14 @@ class FileDownloader(object): | ||||
|             raise SameFileError(self.params['outtmpl']) | ||||
|  | ||||
|         for url in url_list: | ||||
|             suitable_found = False | ||||
|             for ie in self._ies: | ||||
|                 # Go to next InfoExtractor if not suitable | ||||
|                 if not ie.suitable(url): | ||||
|                     continue | ||||
|  | ||||
|                 # Warn if the _WORKING attribute is False | ||||
|                 if not ie.working(): | ||||
|                     self.report_warning(u'the program functionality for this site has been marked as broken, ' | ||||
|                                         u'and will probably not work. If you want to go on, use the -i option.') | ||||
|  | ||||
|                 # Suitable InfoExtractor found | ||||
|                 suitable_found = True | ||||
|  | ||||
|                 # Extract information from URL and process it | ||||
|                 try: | ||||
|                     videos = ie.extract(url) | ||||
|                 except ExtractorError as de: # An error we somewhat expected | ||||
|                     self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback()) | ||||
|                     break | ||||
|                 except MaxDownloadsReached: | ||||
|                     self.to_screen(u'[info] Maximum number of downloaded files reached.') | ||||
|                     raise | ||||
|                 except Exception as e: | ||||
|                     if self.params.get('ignoreerrors', False): | ||||
|                         self.report_error(u'' + compat_str(e), tb=compat_str(traceback.format_exc())) | ||||
|                         break | ||||
|                     else: | ||||
|                         raise | ||||
|  | ||||
|                 if len(videos or []) > 1 and self.fixed_template(): | ||||
|                     raise SameFileError(self.params['outtmpl']) | ||||
|  | ||||
|                 for video in videos or []: | ||||
|                     video['extractor'] = ie.IE_NAME | ||||
|                     try: | ||||
|                         self.increment_downloads() | ||||
|                         self.process_info(video) | ||||
|                     except UnavailableVideoError: | ||||
|                         self.to_stderr(u"\n") | ||||
|                         self.report_error(u'unable to download video') | ||||
|  | ||||
|                 # Suitable InfoExtractor had been found; go to next URL | ||||
|                 break | ||||
|  | ||||
|             if not suitable_found: | ||||
|                 self.report_error(u'no suitable InfoExtractor: %s' % url) | ||||
|             try: | ||||
|                 #It also downloads the videos | ||||
|                 videos = self.extract_info(url) | ||||
|             except UnavailableVideoError: | ||||
|                 self.trouble(u'\nERROR: unable to download video') | ||||
|             except MaxDownloadsReached: | ||||
|                 self.to_screen(u'[info] Maximum number of downloaded files reached.') | ||||
|                 raise | ||||
|  | ||||
|         return self._download_retcode | ||||
|  | ||||
|   | ||||
| @@ -144,6 +144,28 @@ class InfoExtractor(object): | ||||
|             self._downloader.to_screen(dump) | ||||
|         return webpage_bytes.decode(encoding, 'replace') | ||||
|          | ||||
|     #Methods for following #608 | ||||
|     #They set the correct value of the '_type' key | ||||
|     def video_result(self, video_info): | ||||
|         """Returns a video""" | ||||
|         video_info['_type'] = 'video' | ||||
|         return video_info | ||||
|     def url_result(self, url, ie=None): | ||||
|         """Returns a url that points to a page that should be processed""" | ||||
|         #TODO: ie should be the class used for getting the info | ||||
|         video_info = {'_type': 'url', | ||||
|                       'url': url} | ||||
|         return video_info | ||||
|     def playlist_result(self, entries, playlist_id=None, playlist_title=None): | ||||
|         """Returns a playlist""" | ||||
|         video_info = {'_type': 'playlist', | ||||
|                       'entries': entries} | ||||
|         if playlist_id: | ||||
|             video_info['id'] = playlist_id | ||||
|         if playlist_title: | ||||
|             video_info['title'] = playlist_title | ||||
|         return video_info | ||||
|  | ||||
|  | ||||
| class YoutubeIE(InfoExtractor): | ||||
|     """Information extractor for youtube.com.""" | ||||
| @@ -706,8 +728,7 @@ class MetacafeIE(InfoExtractor): | ||||
|         # Check if video comes from YouTube | ||||
|         mobj2 = re.match(r'^yt-(.*)$', video_id) | ||||
|         if mobj2 is not None: | ||||
|             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) | ||||
|             return | ||||
|             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))] | ||||
|  | ||||
|         # Retrieve video webpage to extract further information | ||||
|         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) | ||||
| @@ -1348,7 +1369,7 @@ class GenericIE(InfoExtractor): | ||||
|         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) | ||||
|  | ||||
|     def _test_redirect(self, url): | ||||
|         """Check if it is a redirect, like url shorteners, in case restart chain.""" | ||||
|         """Check if it is a redirect, like url shorteners, in case return the new url.""" | ||||
|         class HeadRequest(compat_urllib_request.Request): | ||||
|             def get_method(self): | ||||
|                 return "HEAD" | ||||
| @@ -1399,11 +1420,11 @@ class GenericIE(InfoExtractor): | ||||
|             return False | ||||
|  | ||||
|         self.report_following_redirect(new_url) | ||||
|         self._downloader.download([new_url]) | ||||
|         return True | ||||
|         return new_url | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         if self._test_redirect(url): return | ||||
|         new_url = self._test_redirect(url) | ||||
|         if new_url: return [self.url_result(new_url)] | ||||
|  | ||||
|         video_id = url.split('/')[-1] | ||||
|         try: | ||||
| @@ -1794,23 +1815,9 @@ class YoutubePlaylistIE(InfoExtractor): | ||||
|             page_num += 1 | ||||
|  | ||||
|         videos = [v[1] for v in sorted(videos)] | ||||
|         total = len(videos) | ||||
|  | ||||
|         playliststart = self._downloader.params.get('playliststart', 1) - 1 | ||||
|         playlistend = self._downloader.params.get('playlistend', -1) | ||||
|         if playlistend == -1: | ||||
|             videos = videos[playliststart:] | ||||
|         else: | ||||
|             videos = videos[playliststart:playlistend] | ||||
|  | ||||
|         if len(videos) == total: | ||||
|             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total)) | ||||
|         else: | ||||
|             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos))) | ||||
|  | ||||
|         for video in videos: | ||||
|             self._downloader.download([video]) | ||||
|         return | ||||
|         url_results = [self.url_result(url) for url in videos] | ||||
|         return [self.playlist_result(url_results, playlist_id)] | ||||
|  | ||||
|  | ||||
| class YoutubeChannelIE(InfoExtractor): | ||||
| @@ -1860,9 +1867,9 @@ class YoutubeChannelIE(InfoExtractor): | ||||
|  | ||||
|         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) | ||||
|  | ||||
|         for id in video_ids: | ||||
|             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) | ||||
|         return | ||||
|         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] | ||||
|         url_entries = [self.url_result(url) for url in urls] | ||||
|         return [self.playlist_result(url_entries, channel_id)] | ||||
|  | ||||
|  | ||||
| class YoutubeUserIE(InfoExtractor): | ||||
| @@ -1932,20 +1939,9 @@ class YoutubeUserIE(InfoExtractor): | ||||
|  | ||||
|             pagenum += 1 | ||||
|  | ||||
|         all_ids_count = len(video_ids) | ||||
|         playliststart = self._downloader.params.get('playliststart', 1) - 1 | ||||
|         playlistend = self._downloader.params.get('playlistend', -1) | ||||
|  | ||||
|         if playlistend == -1: | ||||
|             video_ids = video_ids[playliststart:] | ||||
|         else: | ||||
|             video_ids = video_ids[playliststart:playlistend] | ||||
|  | ||||
|         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % | ||||
|                 (username, all_ids_count, len(video_ids))) | ||||
|  | ||||
|         for video_id in video_ids: | ||||
|             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) | ||||
|         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] | ||||
|         url_results = [self.url_result(url) for url in urls] | ||||
|         return [self.playlist_result(url_results, playlist_title = username)] | ||||
|  | ||||
|  | ||||
| class BlipTVUserIE(InfoExtractor): | ||||
| @@ -2023,20 +2019,12 @@ class BlipTVUserIE(InfoExtractor): | ||||
|  | ||||
|             pagenum += 1 | ||||
|  | ||||
|         all_ids_count = len(video_ids) | ||||
|         playliststart = self._downloader.params.get('playliststart', 1) - 1 | ||||
|         playlistend = self._downloader.params.get('playlistend', -1) | ||||
|  | ||||
|         if playlistend == -1: | ||||
|             video_ids = video_ids[playliststart:] | ||||
|         else: | ||||
|             video_ids = video_ids[playliststart:playlistend] | ||||
|  | ||||
|         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" % | ||||
|                 (self.IE_NAME, username, all_ids_count, len(video_ids))) | ||||
|  | ||||
|         for video_id in video_ids: | ||||
|             self._downloader.download([u'http://blip.tv/'+video_id]) | ||||
|         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] | ||||
|         url_entries = [self.url_result(url) for url in urls] | ||||
|         return [self.playlist_result(url_entries, playlist_title = username)] | ||||
|  | ||||
|  | ||||
| class DepositFilesIE(InfoExtractor): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user