#!/usr/bin/env python
# Source listing: zeit-dl/zeit-dl (Python, executable file; listed as 127 lines, 4.6 KiB)
import os
import sys
import re
from datetime import datetime
import argparse
from urllib.parse import urlparse
from playwright.sync_api import sync_playwright
# Base URL of the e-paper site: issue pages are opened under /abo/<subscription>
# and relative download links are resolved against it.
EPAPER = 'https://epaper.zeit.de'
# Base URL of the account site, used for login (/anmelden) and logout (/abmelden).
MEINE = 'https://meine.zeit.de'
def log(*texts, args=None, level='INFO'):
    """Print a message prefixed with its level, unless quiet mode is on.

    Args:
        *texts: values to print, separated by spaces after the level prefix.
        args: optional parsed command-line namespace; when given and its
            ``quiet`` attribute is true, nothing is printed.
        level: label written in front of the message, followed by a colon.
    """
    quiet = args is not None and args.quiet
    if quiet:
        return
    prefix = level + ':'
    print(prefix, *texts)
def main():
    """Parse the command line, log in, run the download and log out again.

    A headless Chromium instance is started through Playwright, the login
    form on the account site is filled with the given credentials and, if
    the login succeeded, ``download`` is called with the logged-in page.

    Returns:
        int: exit status for ``sys.exit`` -- 0 on success, 1 when the login
        failed, the download reported a failure, or an exception occurred
        while logging in or downloading.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Both positionals are required; argparse itself rejects a call without them.
    parser.add_argument('user', type=str, help='Username')
    parser.add_argument('pwd', type=str, help='Password')
    parser.add_argument('-o', '--out', type=str, default=os.getcwd(), help='Output directory')
    parser.add_argument('-a', '--abo', type=str, choices=['diezeit', 'zeitcampus', 'zeit-audio'], default='diezeit', help='Subscription (part after abo/)')
    parser.add_argument('-i', '--issue', type=str, help='Issue (mostly DD.MM.YYYY)')
    parser.add_argument('-t', '--types', type=str, nargs='*', choices=['pdf', 'epub', 'mp3'], default=['pdf'], help='File type')
    parser.add_argument('-f', '--force', action='store_true', help='Redownload file even if already present')
    parser.add_argument('--format', type=str, default="{abo}_{issue}.{ext}", help='Filename format. Possible formatting strings are {abo}, {issue}, {ext} and datetime format codes.')
    parser.add_argument('-q', '--quiet', action='store_true', help='No output except for filename if written.')
    args = parser.parse_args()

    ret = 0
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(MEINE+'/anmelden')
            page.locator("input[id='login_email']").fill(args.user)
            page.locator("input[id='login_pass']").fill(args.pwd)
            try:
                page.locator("input[type='submit']").click()
                # Still being on the login page after submitting is treated
                # as a rejected login.
                if page.url == MEINE+'/anmelden':
                    log('could not login', args=args, level='ERROR')
                    ret = 1
                else:
                    ret = download(page, args=args)
            except Exception as e:
                # Report the real error first; the logout happens exactly
                # once, in the finally clause below.
                log(e, args=args, level='ERROR')
                ret = 1
            finally:
                # Best-effort logout: a failure here must neither hide the
                # result computed above nor prevent the browser from closing.
                try:
                    page.goto(MEINE+'/abmelden')
                    log('finally: logout', args=args)
                except Exception as e:
                    log('logout failed:', e, args=args, level='WARNING')
        finally:
            browser.close()
    return ret
def download(page, args):
    """Download the requested file types of one issue using a logged-in page.

    The issue page is opened (the highlighted issue of the subscription when
    ``args.issue`` is not set), its download buttons are scanned for the
    requested file types, and every matching file is fetched through the
    page's browser context and written to ``args.out``.

    Args:
        page: Playwright page of the logged-in session.
        args: parsed command-line namespace; reads ``abo``, ``issue``,
            ``types``, ``format``, ``out``, ``force`` and ``quiet``.

    Returns:
        int: 0 when every selected file was written or skipped because it
        already exists; 1 when the subscription is unsupported, no matching
        download button was found, or a download was rejected.
    """
    if args.abo == 'zeit-audio':
        log('not yet implemented', args=args, level='ERROR')
        return 1

    if args.issue is None:
        # No issue given: follow the highlighted issue on the subscription
        # page and take the issue name from the last URL path segment.
        page.goto(EPAPER+'/abo/'+args.abo)
        page.locator("div.epaper-highlighted > a.btn").click(force=True)
        issue = page.url.split('/')[-1]
    else:
        page.goto(EPAPER+'/abo/'+args.abo+'/'+args.issue)
        issue = args.issue

    # Issues are "mostly" named DD.MM.YYYY; only then can datetime format
    # codes in the filename format be expanded.
    try:
        published = datetime.strptime(issue, '%d.%m.%Y')
    except ValueError:
        published = None

    downloads = []  # (url, file type)
    for btn in page.locator("div.download-buttons > a.btn").all():
        js_obj = btn.get_attribute('data-wt-click')
        # The file type is read from the entry keyed 9 of the tracking data;
        # buttons without that attribute or entry are skipped, not fatal.
        match = re.search(r"9: ?'([^']*)'", js_obj) if js_obj else None
        if match is None:
            continue
        file_type = match.group(1)
        if file_type not in args.types:
            continue
        log(js_obj, args=args)
        href = btn.get_attribute('href')
        if href is None:
            continue
        if urlparse(href).netloc == '':
            # Relative link: resolve against the e-paper host.
            href = EPAPER+href
        downloads.append((href, file_type))

    if not downloads:
        log('Could not find appropriate button for', args.types, args=args, level='ERROR')
        return 1

    ret = 0
    for url, file_type in downloads:
        filename = args.format.format(abo=args.abo, issue=issue, ext=file_type)
        if published is not None:
            filename = published.strftime(filename)
        filepath = os.path.join(args.out, filename)
        if os.path.isfile(filepath):
            log('File already exists:', filepath, args=args, level='WARNING')
            if not args.force:
                log('Aborting...', args=args)
                continue
            log('Continuing...', args=args)
        response = page.context.request.get(url)
        content_type = response.headers.get('content-type', '')
        # An HTML answer (possibly with a "; charset=..." suffix) or an error
        # status is a page, not the requested file: report it instead of
        # silently succeeding or saving it under the file's name.
        if not response.ok or content_type.lower().startswith('text/html'):
            log('Download failed for', url,
                '(status {}, content type {!r})'.format(response.status, content_type),
                args=args, level='ERROR')
            ret = 1
            continue
        if args.quiet:
            # Quiet mode prints nothing but the path of each written file.
            print(filepath)
        else:
            log('Downloading {}...'.format(filename), args=args)
        with open(filepath, 'wb') as f:
            f.write(response.body())
    return ret
# Script entry point: run main() and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())