Automatically remove background image.

Migrated from PyPDF4 to pypdf.
2023-12-06 17:20:44 +01:00 · 2023-12-06 17:20:44 +01:00 · 403d8a8025
commit 403d8a8025
parent 2f6d487676
2 changed files with 38 additions and 14 deletions
--- a/README.md
+++ b/README.md
@ -6,18 +6,18 @@ This script uses your login credentials.
 If the script is run multiple times without terminating properly (before logout procedure) the Übungsgruppensystem might block you for some hours.
 Use at your own risk.

+The script may also modify the slides in a way that causes content to go missing.
+
 ## Installation
-Requires `playwright` (with `chromium` driver) and `PyPDF4`:
+Requires `playwright` (with `chromium` driver) and `pypdf`:
 ```sh
-pip install playwright PyPDF4
+pip install playwright pypdf
 playwright install chromium
 ```
 Then you just need to download the [`pep5-dl`](https://git.haagfank.de/LnLcFlx/pep5-dl/raw/branch/master/pep5-dl) file from this repository.

-### (Without `PyPDF4`)
-If you are on Linux and have `pdfunite` installed, you can alternatively supply `--merger=pdfunite` and do not need `PyPDF4`.
-
-Alternatively you can supply `--merger=none --keep --tmpdir=<DIR>` and simply download the individual PDFs to `<DIR>`.
+### (Without `pypdf`)
+You can supply `--merger=none --keep --tmpdir=<DIR>` to simply download the individual, unmodified PDFs to `<DIR>`.

 ## Usage
 If you simply want to download all current slides into one file `slides.pdf` in the current directory, run
@ -26,4 +26,6 @@ python pep5-dl '<USERNAME>' '<PASSWORD>'
 ```
 where you have to replace `<USERNAME>` and `<PASSWORD>` with your credentials.

+By default the script removes the background image.
+If this causes other images to go missing, supply `--keepbg` to keep the background.
 For help and more options run `pep5-dl --help`.
--- a/38
+++ b/38
@ -9,8 +9,32 @@ from playwright.sync_api import sync_playwright
 BASE = 'https://uebungen.physik.uni-heidelberg.de'
 LOGIN = BASE+'/uebungen/login.php'
 MATERIAL = BASE+'/c/image/d/vorlesung/20232/1735/material/'
-
 DEFAULT = os.path.join(os.getcwd(), 'slides.pdf')
+BG_IMAGES = ['/Im'+str(i) for i in range(1, 5)]  # XObject names '/Im1'..'/Im4' that pypdf_merge always deletes
+BG_DIMS = [(1499, 1024)]  # (/Width, /Height) pairs; other '/Im*' XObjects matching one are deleted too
+
+
+def pypdf_merge(pdfs, out, removebg=False):
+    """Merge PDF files into one document, optionally dropping background images.
+
+    Args:
+        pdfs: Iterable of input PDFs, appended in order via ``PdfWriter.append``.
+        out: Destination handed to ``PdfWriter.write``.
+        removebg: If true, delete image XObjects from each page's resources
+            when their name is listed in ``BG_IMAGES`` or their
+            ``(/Width, /Height)`` pair is listed in ``BG_DIMS``.
+    """
+    # Imported inside the function so pypdf is only needed for this merger.
+    from pypdf import PdfWriter
+    merger = PdfWriter()
+    try:
+        for pdf in pdfs:
+            merger.append(pdf)
+
+        if removebg:
+            print('Removing background...')
+            for page in merger.pages:
+                # A page without its own '/Resources' or '/XObject' entry has
+                # nothing to strip; indexing it blindly would raise KeyError
+                # and abort the whole merge.
+                if '/Resources' not in page:
+                    continue
+                resources = page['/Resources']
+                if '/XObject' not in resources:
+                    continue
+                xobjects = resources['/XObject']
+                # Snapshot the keys because entries are deleted while looping.
+                for key in list(xobjects.keys()):
+                    if not key.startswith('/Im'):
+                        continue
+                    o = xobjects[key]
+                    # Background if matched by name, or by exact dimensions.
+                    is_bg = key in BG_IMAGES or (
+                        '/Width' in o and '/Height' in o
+                        and (o['/Width'], o['/Height']) in BG_DIMS)
+                    if is_bg:
+                        del xobjects[key]
+
+        merger.write(out)
+    finally:
+        # Release the writer even if appending, stripping or writing failed.
+        merger.close()


 def main():
@ -22,6 +46,7 @@ def main():
    parser.add_argument('-k', '--keep', action='store_true', help='Keep temporary files in TMPDIR')
    parser.add_argument('-t', '--tmpdir', type=str, default=tempfile.gettempdir(), help='Temporary directory')
    parser.add_argument('-m', '--merger', type=str, default='pypdf', choices=['pypdf', 'pdfunite', 'none'], help='Method used for merging PDFs')
+    parser.add_argument('--keepbg', action='store_true', help='Keep background images of slides (in case of bugs)')
    args = parser.parse_args()
    if args.user is None or args.pwd is None:
        parser.error('You need to supply a username and password')
@ -42,7 +67,7 @@ def main():
                i = 1
                while True:
                    name = 'PEP5_{:02d}.pdf'.format(i)
-                    if not os.path.isfile(name) or args.all:
+                    if not os.path.isfile(os.path.join(args.tmpdir, name)) or args.force:
                        pdf = page.context.request.get(MATERIAL+name)
                        if pdf.headers['content-type'] == 'application/pdf':
                            print('Downloading {}...'.format(name))
@ -64,13 +89,10 @@ def main():
    gl = glob.glob(glexpr)

    if args.out is not None and args.out != '':
+        if args.merger != 'none':
+            print('Merging PDFs...')
        if args.merger == 'pypdf':
-            from PyPDF4 import PdfMerger
-            merger = PdfMerger()
-            for pdf in gl:
-                merger.append(pdf)
-            merger.write(args.out)
-            merger.close()
+            pypdf_merge(gl, args.out, not args.keepbg)
        elif args.merger == 'pdfunite':
            from subprocess import Popen
            p = Popen('pdfunite {} {}'.format(glexpr, args.out), shell=True)