From 403d8a8025a4500e13429266a8ef291d9709cf6c Mon Sep 17 00:00:00 2001 From: Leon Haag-Fank Date: Wed, 6 Dec 2023 17:20:44 +0100 Subject: [PATCH] Automatically remove background image. Migrated from PyPDF4 to pypdf. --- README.md | 14 ++++++++------ pep5-dl | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 2b3d713..6604bd8 100644 --- a/README.md +++ b/README.md @@ -6,18 +6,18 @@ This script uses your login credentials. If the script is run multiple times without terminating properly (before logout procedure) the Übungsgruppensystem might block you for some hours. Use at your own risk. +The script might also modify the slides such that content might go missing. + ## Installation -Requires `playwright` (with `chromium` driver) and `PyPDF4`: +Requires `playwright` (with `chromium` driver) and `pypdf`: ```sh -pip install playwright PyPDF4 +pip install playwright pypdf playwright install chromium ``` Then you just need to download the [`pep5-dl`](https://git.haagfank.de/LnLcFlx/pep5-dl/raw/branch/master/pep5-dl) file from this repository. -### (Without `PyPDF4`) -If you are on Linux and have `pdfunite` installed, you can alternatively supply `--merger=pdfunite` and do not need `PyPDF4`. - -Alternatively you can supply `--merger=none --keep --tmpdir=` and simply download the individual PDFs to ``. +### (Without `pypdf`) +You can supply `--merger=none --keep --tmpdir=` and simply download the individual PDFs without any changes to ``. ## Usage If you simply want to download all current slides into one file `slides.pdf` in the current directory, run @@ -26,4 +26,6 @@ python pep5-dl '' '' ``` where you have to replace `` and `` with your credentials. +By default the script removes the background image. +If this leads to other missing images, supply `--keepbg` to keep the background. For help and more options run `pep5-dl --help`. 
# Resource names of the image XObjects that are always treated as slide
# backgrounds: '/Im1' through '/Im4'.
BG_IMAGES = ['/Im'+str(i) for i in range(1, 5)]
# (width, height) pairs, compared against an image's '/Width' and '/Height'
# entries, that also mark an image as a background.
BG_DIMS = [(1499, 1024)]


def pypdf_merge(pdfs, out, removebg=False):
    """Concatenate PDF files into a single document using pypdf.

    Args:
        pdfs: Iterable of input PDFs, appended in iteration order. Each item
            is passed unchanged to ``PdfWriter.append``.
        out: Destination passed unchanged to ``PdfWriter.write``.
        removebg: When true, every image XObject whose resource name is in
            ``BG_IMAGES``, or whose (width, height) is in ``BG_DIMS``, is
            removed from each merged page before the result is written.

    Pages without a ``/Resources`` or ``/XObject`` dictionary (for example
    pages that contain no images) are left untouched instead of raising
    ``KeyError`` and aborting the merge.
    """
    # Imported here so that pypdf is only required when this merger is used.
    from pypdf import PdfWriter

    merger = PdfWriter()
    try:
        for pdf in pdfs:
            merger.append(pdf)

        if removebg:
            print('Removing background...')
            for page in merger.pages:
                # Use `in` + indexing rather than .get(): indexing is what
                # the original lookup relied on to reach the dictionaries.
                if '/Resources' not in page:
                    continue
                resources = page['/Resources']
                if '/XObject' not in resources:
                    continue
                xobjects = resources['/XObject']

                # Snapshot the keys because entries are deleted while looping.
                for key in list(xobjects.keys()):
                    if not key.startswith('/Im'):
                        continue
                    if key in BG_IMAGES:
                        is_background = True
                    else:
                        image = xobjects[key]
                        is_background = (
                            '/Width' in image
                            and '/Height' in image
                            and (image['/Width'], image['/Height']) in BG_DIMS
                        )
                    if is_background:
                        # Only the resource entry is dropped; the page's
                        # content stream is not rewritten.
                        del xobjects[key]

        merger.write(out)
    finally:
        # Release the writer even if appending or writing failed.
        merger.close()
{}...'.format(name)) @@ -64,13 +89,10 @@ def main(): gl = glob.glob(glexpr) if args.out is not None and args.out != '': + if args.merger != 'none': + print('Merging PDFs...') if args.merger == 'pypdf': - from PyPDF4 import PdfMerger - merger = PdfMerger() - for pdf in gl: - merger.append(pdf) - merger.write(args.out) - merger.close() + pypdf_merge(gl, args.out, not args.keepbg) elif args.merger == 'pdfunite': from subprocess import Popen p = Popen('pdfunite {} {}'.format(glexpr, args.out), shell=True)