diff --git a/zeit-dl b/zeit-dl index 8368e8e..9dd021e 100755 --- a/zeit-dl +++ b/zeit-dl @@ -5,6 +5,7 @@ import sys import re from datetime import datetime import argparse +from urllib.parse import urlparse from playwright.sync_api import sync_playwright EPAPER = 'https://epaper.zeit.de' @@ -25,7 +26,7 @@ def main(): parser.add_argument('-o', '--out', type=str, default=os.getcwd(), help='Output directory') parser.add_argument('-a', '--abo', type=str, choices=['diezeit', 'zeitcampus', 'zeit-audio'], default='diezeit', help='Subscription (part after abo/)') parser.add_argument('-i', '--issue', type=str, help='Issue (mostly DD.MM.YYYY)') - parser.add_argument('-t', '--type', type=str, choices=['pdf', 'epub', 'mp3'], default='pdf', help='File type') + parser.add_argument('-t', '--types', type=str, nargs='*', choices=['pdf', 'epub', 'mp3'], default=['pdf'], help='File type') parser.add_argument('-f', '--force', action='store_true', help='Redownload file even if already present') parser.add_argument('--format', type=str, default="{abo}_{issue}.{ext}", help='Filename format. Possible formatting strings are {abo}, {issue}, {ext} and datetime format codes.') parser.add_argument('-q', '--quiet', action='store_true', help='No output except for filename if written.') @@ -59,6 +60,8 @@ def main(): def download(page, args): + downloads = [] # (url, type) + if args.abo == 'zeit-audio': log('not yet implemented', args=args, level='ERROR') return 1 @@ -70,12 +73,32 @@ def download(page, args): else: page.goto(EPAPER+'/abo/'+args.abo+'/'+args.issue) issue = args.issue - try: published = datetime.strptime(issue, '%d.%m.%Y') except ValueError: published = None - filename = args.format.format(abo=args.abo, issue=issue, ext=args.type) + + dl_btns = page.locator("div.download-buttons > a.btn").all() + url = None + for btn in dl_btns: + js_obj = btn.get_attribute('data-wt-click') + match = re.search(r"9: ?'([^']*)'", js_obj) + type = match.group(1) + if type in args.types: + log(js_obj, args=args) + url = btn.get_attribute('href') + o = urlparse(url) + if o.netloc == '': + url = EPAPER+url + if url is not None: + downloads.append((url, type)) + + if len(downloads) == 0: + log('Could not find appropriate button for', args.types, args=args, level='ERROR') + return 1 + + for url, type in downloads: + filename = args.format.format(abo=args.abo, issue=issue, ext=type) if published is not None: filename = published.strftime(filename) @@ -86,20 +109,7 @@ def download(page, args): log('Continuing...', args=args) else: log('Aborting...', args=args) - return 0 - - dl_btns = page.locator("div.download-buttons > a.btn").all() - url = None - for btn in dl_btns: - js_obj = btn.get_attribute('data-wt-click') - match = re.search(r"9: ?'([^']*)'", js_obj) - if match.group(1) == args.type: - log(js_obj, args=args) - url = EPAPER+btn.get_attribute('href') continue - if url is None: - log('Could not find appropriate button for', args.type, args=args, level='ERROR') - return 1 file = page.context.request.get(url) if file.headers['content-type'] != 'text/html':