Automatically remove background image.

Migrated from PyPDF4 to pypdf.
This commit is contained in:
Leon Haag-Fank 2023-12-06 17:20:44 +01:00
parent 2f6d487676
commit 403d8a8025
2 changed files with 38 additions and 14 deletions

View file

@ -6,18 +6,18 @@ This script uses your login credentials.
If the script is run multiple times without terminating properly (i.e. before the logout procedure has completed), the Übungsgruppensystem might block you for some hours.
Use at your own risk.
The script might also modify the slides such that content might go missing.
## Installation
Requires `playwright` (with `chromium` driver) and `PyPDF4`:
Requires `playwright` (with `chromium` driver) and `pypdf`:
```sh
pip install playwright PyPDF4
pip install playwright pypdf
playwright install chromium
```
Then you just need to download the [`pep5-dl`](https://git.haagfank.de/LnLcFlx/pep5-dl/raw/branch/master/pep5-dl) file from this repository.
### (Without `PyPDF4`)
If you are on Linux and have `pdfunite` installed, you can alternatively supply `--merger=pdfunite` and do not need `PyPDF4`.
Alternatively you can supply `--merger=none --keep --tmpdir=<DIR>` and simply download the individual PDFs to `<DIR>`.
### (Without `pypdf`)
You can supply `--merger=none --keep --tmpdir=<DIR>` and simply download the individual PDFs without any changes to `<DIR>`.
## Usage
If you simply want to download all current slides into one file `slides.pdf` in the current directory, run
@ -26,4 +26,6 @@ python pep5-dl '<USERNAME>' '<PASSWORD>'
```
where you have to replace `<USERNAME>` and `<PASSWORD>` with your credentials.
By default the script removes the background image.
If this leads to other missing images, supply `--keepbg` to keep the background.
For help and more options run `pep5-dl --help`.

38
pep5-dl
View file

@ -9,8 +9,32 @@ from playwright.sync_api import sync_playwright
# Root URL of the exercise-group web site; the other URLs are built on top of it.
BASE = 'https://uebungen.physik.uni-heidelberg.de'
# Login page of the site.
LOGIN = f'{BASE}/uebungen/login.php'
# Directory URL to which the individual slide file names are appended.
MATERIAL = f'{BASE}/c/image/d/vorlesung/20232/1735/material/'
# 'slides.pdf' inside the current working directory
# (presumably the default output path -- confirm against the argument parser).
DEFAULT = os.path.join(os.getcwd(), 'slides.pdf')
# XObject names ('/Im1' .. '/Im4') that pypdf_merge always removes as background.
BG_IMAGES = [f'/Im{n}' for n in range(1, 5)]
# (width, height) pairs; pypdf_merge also removes '/Im*' images with these dimensions.
BG_DIMS = [(1499, 1024)]
def pypdf_merge(pdfs, out, removebg=False):
    """Merge several PDFs into one file, optionally stripping background images.

    Args:
        pdfs: Iterable of input PDFs; each is appended in order with
            ``PdfWriter.append``.
        out: Destination handed to ``PdfWriter.write``.
        removebg: When true, every page's ``/XObject`` resources are scanned
            and entries whose name starts with ``/Im`` are deleted if the name
            is listed in ``BG_IMAGES`` or the image's ``(/Width, /Height)``
            pair is listed in ``BG_DIMS``.
    """
    # Imported here so that pypdf is only required when this merger is used.
    from pypdf import PdfWriter

    merger = PdfWriter()
    try:
        for pdf in pdfs:
            merger.append(pdf)

        if removebg:
            print('Removing background...')
            for page in merger.pages:
                # A page without resources or without any XObjects has nothing
                # to strip; indexing it blindly would raise KeyError.
                if '/Resources' not in page:
                    continue
                resources = page['/Resources']
                if '/XObject' not in resources:
                    continue
                xobjects = resources['/XObject']

                # Snapshot the keys because entries are deleted while looping.
                for key in list(xobjects.keys()):
                    if not key.startswith('/Im'):
                        continue
                    image = xobjects[key]
                    if key in BG_IMAGES:
                        del xobjects[key]
                    elif '/Width' in image and '/Height' in image:
                        if (image['/Width'], image['/Height']) in BG_DIMS:
                            del xobjects[key]

        merger.write(out)
    finally:
        # Release the writer even if appending or writing failed.
        merger.close()
def main():
@ -22,6 +46,7 @@ def main():
parser.add_argument('-k', '--keep', action='store_true', help='Keep temporary files in TMPDIR')
parser.add_argument('-t', '--tmpdir', type=str, default=tempfile.gettempdir(), help='Temporary directory')
parser.add_argument('-m', '--merger', type=str, default='pypdf', choices=['pypdf', 'pdfunite', 'none'], help='Method used for merging PDFs')
parser.add_argument('--keepbg', action='store_true', help='Keep background images of slides (in case of bugs)')
args = parser.parse_args()
if args.user is None or args.pwd is None:
parser.error('You need to supply a username and password')
@ -42,7 +67,7 @@ def main():
i = 1
while True:
name = 'PEP5_{:02d}.pdf'.format(i)
if not os.path.isfile(name) or args.all:
if not os.path.isfile(os.path.join(args.tmpdir, name)) or args.force:
pdf = page.context.request.get(MATERIAL+name)
if pdf.headers['content-type'] == 'application/pdf':
print('Downloading {}...'.format(name))
@ -64,13 +89,10 @@ def main():
gl = glob.glob(glexpr)
if args.out is not None and args.out != '':
if args.merger != 'none':
print('Merging PDFs...')
if args.merger == 'pypdf':
from PyPDF4 import PdfMerger
merger = PdfMerger()
for pdf in gl:
merger.append(pdf)
merger.write(args.out)
merger.close()
pypdf_merge(gl, args.out, not args.keepbg)
elif args.merger == 'pdfunite':
from subprocess import Popen
p = Popen('pdfunite {} {}'.format(glexpr, args.out), shell=True)