Automatically remove background image.
Migrated from PyPDF4 to pypdf.
This commit is contained in:
parent
2f6d487676
commit
403d8a8025
2 changed files with 38 additions and 14 deletions
14
README.md
14
README.md
|
@ -6,18 +6,18 @@ This script uses your login credentials.
|
||||||
If the script is run multiple times without terminating properly (before logout procedure) the Übungsgruppensystem might block you for some hours.
|
If the script is run multiple times without terminating properly (before logout procedure) the Übungsgruppensystem might block you for some hours.
|
||||||
Use at your own risk.
|
Use at your own risk.
|
||||||
|
|
||||||
|
The script might also modify the slides such that content might go missing.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
Requires `playwright` (with `chromium` driver) and `PyPDF4`:
|
Requires `playwright` (with `chromium` driver) and `pypdf`:
|
||||||
```sh
|
```sh
|
||||||
pip install playwright PyPDF4
|
pip install playwright pypdf
|
||||||
playwright install chromium
|
playwright install chromium
|
||||||
```
|
```
|
||||||
Then you just need to download the [`pep5-dl`](https://git.haagfank.de/LnLcFlx/pep5-dl/raw/branch/master/pep5-dl) file from this repository.
|
Then you just need to download the [`pep5-dl`](https://git.haagfank.de/LnLcFlx/pep5-dl/raw/branch/master/pep5-dl) file from this repository.
|
||||||
|
|
||||||
### (Without `PyPDF4`)
|
### (Without `pypdf`)
|
||||||
If you are on Linux and have `pdfunite` installed, you can alternatively supply `--merger=pdfunite` and do not need `PyPDF4`.
|
You can supply `--merger=none --keep --tmpdir=<DIR>` to simply download the individual, unmodified PDFs into `<DIR>`.
|
||||||
|
|
||||||
Alternatively you can supply `--merger=none --keep --tmpdir=<DIR>` and simply download the individual PDFs to `<DIR>`.
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
If you simply want to download all current slides into one file `slides.pdf` in the current directory, run
|
If you simply want to download all current slides into one file `slides.pdf` in the current directory, run
|
||||||
|
@ -26,4 +26,6 @@ python pep5-dl '<USERNAME>' '<PASSWORD>'
|
||||||
```
|
```
|
||||||
where you have to replace `<USERNAME>` and `<PASSWORD>` with your credentials.
|
where you have to replace `<USERNAME>` and `<PASSWORD>` with your credentials.
|
||||||
|
|
||||||
|
By default the script removes the background image.
|
||||||
|
If this leads to other missing images, supply `--keepbg` to keep the background.
|
||||||
For help and more options run `pep5-dl --help`.
|
For help and more options run `pep5-dl --help`.
|
||||||
|
|
38
pep5-dl
38
pep5-dl
|
@ -9,8 +9,32 @@ from playwright.sync_api import sync_playwright
|
||||||
BASE = 'https://uebungen.physik.uni-heidelberg.de'
|
BASE = 'https://uebungen.physik.uni-heidelberg.de'
|
||||||
LOGIN = BASE+'/uebungen/login.php'
|
LOGIN = BASE+'/uebungen/login.php'
|
||||||
MATERIAL = BASE+'/c/image/d/vorlesung/20232/1735/material/'
|
MATERIAL = BASE+'/c/image/d/vorlesung/20232/1735/material/'
|
||||||
|
|
||||||
DEFAULT = os.path.join(os.getcwd(), 'slides.pdf')
|
DEFAULT = os.path.join(os.getcwd(), 'slides.pdf')
|
||||||
|
BG_IMAGES = ['/Im'+str(i) for i in range(1, 5)]
|
||||||
|
BG_DIMS = [(1499, 1024)]
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_background(page):
    """Delete background image XObjects from one page's resource dictionary.

    An XObject is removed when its name starts with '/Im' and either the name
    is listed in BG_IMAGES or its (/Width, /Height) pair is listed in BG_DIMS.
    Only the resource entry is deleted; the page content stream is not edited.
    """
    # A page without a resource dictionary or without XObjects has nothing to
    # strip; indexing these keys unconditionally would raise KeyError.
    if '/Resources' not in page:
        return
    resources = page['/Resources']
    if '/XObject' not in resources:
        return
    xobjects = resources['/XObject']

    # Snapshot the keys because entries are deleted while looping.
    for key in list(xobjects.keys()):
        if not key.startswith('/Im'):
            continue
        obj = xobjects[key]
        if key in BG_IMAGES:
            del xobjects[key]
        elif '/Width' in obj and '/Height' in obj:
            if (obj['/Width'], obj['/Height']) in BG_DIMS:
                del xobjects[key]


def pypdf_merge(pdfs, out, removebg=False):
    """Merge PDFs into a single file, optionally removing background images.

    Args:
        pdfs: Iterable of inputs, each passed to ``PdfWriter.append`` in order.
        out: Destination passed to ``PdfWriter.write``.
        removebg: When true, strip the background image XObjects (see
            ``_strip_background``) from every merged page before writing.
    """
    # Local import: pypdf is only needed when this function actually runs.
    from pypdf import PdfWriter

    merger = PdfWriter()
    try:
        for pdf in pdfs:
            merger.append(pdf)

        if removebg:
            print('Removing background...')
            for page in merger.pages:
                _strip_background(page)

        merger.write(out)
    finally:
        # Release the writer even if appending or writing failed.
        merger.close()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -22,6 +46,7 @@ def main():
|
||||||
parser.add_argument('-k', '--keep', action='store_true', help='Keep temporary files in TMPDIR')
|
parser.add_argument('-k', '--keep', action='store_true', help='Keep temporary files in TMPDIR')
|
||||||
parser.add_argument('-t', '--tmpdir', type=str, default=tempfile.gettempdir(), help='Temporary directory')
|
parser.add_argument('-t', '--tmpdir', type=str, default=tempfile.gettempdir(), help='Temporary directory')
|
||||||
parser.add_argument('-m', '--merger', type=str, default='pypdf', choices=['pypdf', 'pdfunite', 'none'], help='Method used for merging PDFs')
|
parser.add_argument('-m', '--merger', type=str, default='pypdf', choices=['pypdf', 'pdfunite', 'none'], help='Method used for merging PDFs')
|
||||||
|
parser.add_argument('--keepbg', action='store_true', help='Keep background images of slides (in case of bugs)')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.user is None or args.pwd is None:
|
if args.user is None or args.pwd is None:
|
||||||
parser.error('You need to supply a username and password')
|
parser.error('You need to supply a username and password')
|
||||||
|
@ -42,7 +67,7 @@ def main():
|
||||||
i = 1
|
i = 1
|
||||||
while True:
|
while True:
|
||||||
name = 'PEP5_{:02d}.pdf'.format(i)
|
name = 'PEP5_{:02d}.pdf'.format(i)
|
||||||
if not os.path.isfile(name) or args.all:
|
if not os.path.isfile(os.path.join(args.tmpdir, name)) or args.force:
|
||||||
pdf = page.context.request.get(MATERIAL+name)
|
pdf = page.context.request.get(MATERIAL+name)
|
||||||
if pdf.headers['content-type'] == 'application/pdf':
|
if pdf.headers['content-type'] == 'application/pdf':
|
||||||
print('Downloading {}...'.format(name))
|
print('Downloading {}...'.format(name))
|
||||||
|
@ -64,13 +89,10 @@ def main():
|
||||||
gl = glob.glob(glexpr)
|
gl = glob.glob(glexpr)
|
||||||
|
|
||||||
if args.out is not None and args.out != '':
|
if args.out is not None and args.out != '':
|
||||||
|
if args.merger != 'none':
|
||||||
|
print('Merging PDFs...')
|
||||||
if args.merger == 'pypdf':
|
if args.merger == 'pypdf':
|
||||||
from PyPDF4 import PdfMerger
|
pypdf_merge(gl, args.out, not args.keepbg)
|
||||||
merger = PdfMerger()
|
|
||||||
for pdf in gl:
|
|
||||||
merger.append(pdf)
|
|
||||||
merger.write(args.out)
|
|
||||||
merger.close()
|
|
||||||
elif args.merger == 'pdfunite':
|
elif args.merger == 'pdfunite':
|
||||||
from subprocess import Popen
|
from subprocess import Popen
|
||||||
p = Popen('pdfunite {} {}'.format(glexpr, args.out), shell=True)
|
p = Popen('pdfunite {} {}'.format(glexpr, args.out), shell=True)
|
||||||
|
|
Loading…
Add table
Reference in a new issue