class Config:
    """Settings for the Paperless post-consume script, read from the environment.

    Environment variables:
        PAPERLESS_BASE_URL: required, base URL of the Paperless instance.
        PAPERLESS_POST_CONSUME_AUTH_TOKEN: required, API token.
        PAPERLESS_POST_CONSUME_EXTRACTOR_NOTIFY_TAG: optional, name of the tag
            added to documents whose title was rewritten
            (default: "Title changed").
        PAPERLESS_POST_CONSUME_EXPAND_BY_DATE: optional switch for appending
            month and year to the title. Enabled unless set to one of
            "false", "0", "f", "no", "n" or "off" (case-insensitive).

    Raises:
        KeyError: if one of the required variables is not set.
    """

    # Values (lower-cased) that switch a boolean setting off.
    _FALSE_VALUES = ("false", "0", "f", "no", "n", "off")

    def __init__(self) -> None:
        self._paperless_url = os.environ["PAPERLESS_BASE_URL"]
        self._auth_token = os.environ["PAPERLESS_POST_CONSUME_AUTH_TOKEN"]
        self._notify_tag = os.getenv(
            "PAPERLESS_POST_CONSUME_EXTRACTOR_NOTIFY_TAG", "Title changed"
        )
        raw_flag = os.getenv("PAPERLESS_POST_CONSUME_EXPAND_BY_DATE", "True")
        # The flag is on unless it is explicitly set to a "false" value.
        self._expand_by_date = raw_flag.strip().lower() not in self._FALSE_VALUES

    def get_paperless_url(self):
        """Return the base URL exactly as configured."""
        return self._paperless_url

    def get_paperless_api_url(self):
        """Return the API root URL, i.e. the base URL followed by "/api"."""
        # Strip a trailing slash so "http://host/" does not become "http://host//api".
        return self._paperless_url.rstrip("/") + "/api"

    def get_auth_token(self):
        """Return the API token."""
        return self._auth_token

    def get_notify_tag(self) -> str:
        """Return the name of the tag that marks documents with a changed title."""
        return self._notify_tag

    def expand_title_by_date(self) -> bool:
        """Return True when month and year should be appended to the title."""
        return self._expand_by_date

    def __str__(self) -> str:
        # The token is masked on purpose: this string is printed to the log.
        return " ".join([self._paperless_url, self.get_paperless_api_url(), "***"])
class PaperlessAPI:
    """Small client for the Paperless REST endpoints used by the post-consume script.

    Every request is authenticated with the given token and uses the same
    timeout, so an unresponsive server cannot block the caller indefinitely.
    Network errors raised by ``requests`` (including timeouts) propagate to
    the caller.
    """

    def __init__(self, api_url, auth_token, timeout=5) -> None:
        """Store the connection settings.

        Args:
            api_url: API root URL, without a trailing slash.
            auth_token: Paperless API token.
            timeout: timeout handed to every ``requests`` call, in seconds.
        """
        self._base_api_url = api_url
        self._auth_token = auth_token
        self._timeout = timeout

    def _headers(self):
        """Return the authorization header sent with every request."""
        return {"Authorization": f"Token {self._auth_token}"}

    @staticmethod
    def _match_tags(tags, tag_name):
        """Return the tag dicts from *tags* whose "name" equals *tag_name*."""
        return [tag for tag in tags if tag.get("name") == tag_name]

    def _get_item_by_id(self, item_type, item_id):
        """Fetch one item; return its JSON, or {} for a falsy id or a failed request."""
        if not item_id:
            return {}

        response = requests.get(f"{self._base_api_url}/{item_type}/{item_id}/",
                                headers=self._headers(),
                                timeout=self._timeout)
        return response.json() if response.ok else {}

    def get_document_by_id(self, document_id):
        """Return the document's JSON representation, or {} if it cannot be fetched."""
        return self._get_item_by_id("documents", document_id)

    def get_tag_id_by_name(self, tag_name):
        """Return the id of the tag named *tag_name*, or None if there is none.

        The tag list is paginated; pages are followed through the "next" link
        of each response until the tag is found or no page is left.
        """
        url = f"{self._base_api_url}/tags/"
        while url:
            response = requests.get(url,
                                    headers=self._headers(),
                                    timeout=self._timeout)
            if not response.ok:
                break

            print("tags ok")
            payload = response.json()
            # "results" may be absent or null; treat both as an empty page.
            fetched_tag = self._match_tags(payload.get("results") or [], tag_name)
            if fetched_tag:
                print(f'fetched = {fetched_tag}')
                return fetched_tag[0].get("id")

            url = payload.get("next")

        print("not found, returning none")
        return None

    def patch_document(self, document_id, data):
        """PATCH *data* onto the document and return the ``requests`` response."""
        return requests.patch(f"{self._base_api_url}/documents/{document_id}/",
                              headers=self._headers(),
                              data=data,
                              timeout=self._timeout)
# Matches "<8 digits><separator><rest>". The separator is either a single
# whitespace character or " <non-word character> ", e.g. "20250504 - Invoice".
_TITLE_PATTERN = re.compile(r'^(\d{8})(\s\W\s|\s)(.*)')


def expand_by_non_existing_parts(title, parts=None) -> str:
    """Append each part to *title* unless the title already contains it.

    A part counts as contained only when it occurs as a whole word, so
    "Mai" is not considered present in "Mainova". Parts are converted with
    ``str()`` and joined to the title with a single space; empty parts are
    skipped.

    Args:
        title: the string to extend.
        parts: values to append; ``None`` or an empty sequence leaves the
            title unchanged.

    Returns:
        The extended title.
    """
    if not parts:
        return title

    for part in parts:
        text = str(part)
        if not text:
            continue
        # Look-arounds instead of \b so that parts starting or ending with a
        # non-word character are still matched as whole tokens.
        if re.search(rf"(?<!\w){re.escape(text)}(?!\w)", title):
            continue
        # Avoid a leading space when the title is still empty.
        title = f"{title} {text}" if title else text

    return title


def _retitle(title, add_date_suffix):
    """Return the rewritten title, or None when the title must stay as it is.

    A title is rewritten only when it starts with a valid YYYYMMDD date
    followed by a separator. The date prefix is removed and, if
    *add_date_suffix* is true, the month name and year are appended unless
    the remaining title already contains them.
    """
    findings = _TITLE_PATTERN.match(title)
    if not findings:
        return None

    date_string = findings.group(1)
    print(f"title date string={date_string}")
    try:
        d = datetime.strptime(date_string, "%Y%m%d")
    except ValueError:
        # Eight digits that are not a calendar date are not a date prefix;
        # leave such titles untouched instead of failing the whole script.
        print(f"not a valid date, title left unchanged: {date_string}")
        return None
    print(f"parsed date={d}")

    extracted_new_title = findings.group(3)
    print(f'Extracted title="{extracted_new_title}"')

    # should add Month Year suffix to title?
    if add_date_suffix:
        print("add suffix")
        extracted_new_title = expand_by_non_existing_parts(
            extracted_new_title, [MONTH_NAMES[d.month], d.year]
        )

    return extracted_new_title


if __name__ == "__main__":
    cfg = Config()

    print("START POST CONSUME")
    # Log only the API URL; printing the whole config would expose the token.
    print(cfg.get_paperless_api_url())

    api = PaperlessAPI(cfg.get_paperless_api_url(), cfg.get_auth_token())

    document_id = os.environ["DOCUMENT_ID"]
    document = api.get_document_by_id(document_id)
    if not document:
        # The client returns {} when the document could not be fetched.
        raise SystemExit(f"Could not load document {document_id} from the Paperless API")

    new_title = _retitle(document["title"], cfg.expand_title_by_date())
    if new_title is None:
        print("Nothing changed")
    else:
        data = {"title": new_title}

        tag_id = api.get_tag_id_by_name(cfg.get_notify_tag())
        print(tag_id)
        if tag_id is not None:
            tags = list(document.get("tags") or [])
            # Do not add the notify tag a second time.
            if tag_id not in tags:
                tags.append(tag_id)
            data["tags"] = tags

        response = api.patch_document(document_id, data)

        print(f'Status Code = {response.status_code}')
        response.raise_for_status()

    print("END POST CONSUME")
# Print the numbers of all non-blank pages of "$IN" on stdout, one per line.
#
# Reads the globals IN (path of the PDF), PAGES (page count) and THRESHOLD
# (a page is kept when its colour sum is greater than this value).
# Progress messages go to stderr so that stdout carries page numbers only.
non_blank() {
    local page percent

    for page in $(seq 1 "$PAGES"); do
        # Sum the first four fields of the CMYK line(s) that the ink_cov
        # device reports for this single page.
        percent=$(gs -o - -dFirstPage="$page" -dLastPage="$page" -sDEVICE=ink_cov "$IN" \
            | grep CMYK \
            | awk 'BEGIN { sum = 0 } { sum += $1 + $2 + $3 + $4 } END { printf "%.5f\n", sum }')
        >&2 echo -n "Color-sum in page $page is $percent: "

        # Hand the values over with -v instead of pasting them into the awk
        # program text, so an unexpected value cannot change the program.
        if awk -v percent="$percent" -v threshold="$THRESHOLD" 'BEGIN { exit !(percent > threshold) }'; then
            echo "$page"
            >&2 echo "Page added to document"
        else
            >&2 echo "Page removed from document"
        fi
    done
}
$NON_BLANK -- +fi diff --git a/scripts/tags.json b/scripts/tags.json new file mode 100755 index 0000000..9a82411 --- /dev/null +++ b/scripts/tags.json @@ -0,0 +1,59 @@ +{'count': 17, +'next': None, +'previous': None, +'all': [ + 12, + 9, + 1, + 5, + 11, + 2, + 8, + 14, + 6, + 3, + 15, + 7, + 17, + 16, + 13, + 4, + 10 + ], + 'results': [ + {'id': 12, 'slug': 'buro', 'name': 'Büro', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True + }, + {'id': 9, 'slug': 'elektronik', 'name': 'Elektronik', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 3, 'owner': 3, 'user_can_change': True + }, + {'id': 1, 'slug': 'inbox', 'name': 'Inbox', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': True, 'document_count': 2, 'owner': 3, 'user_can_change': True + }, + {'id': 5, 'slug': 'internet', 'name': 'Internet', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True + }, + {'id': 11, 'slug': 'konto', 'name': 'Konto', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True + }, + {'id': 2, 'slug': 'kredit', 'name': 'Kredit', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True + }, + {'id': 8, 'slug': 'kreditkarte', 'name': 'Kreditkarte', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 3, 'owner': 3, 'user_can_change': True + }, + {'id': 14, 'slug': 'lizenzschlussel', 'name': 'Lizenzschlüssel', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 1, 
'owner': 4, 'user_can_change': True + }, + {'id': 6, 'slug': 'mobilfunk', 'name': 'Mobilfunk', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True + }, + {'id': 3, 'slug': 'ruckschein', 'name': 'Rückschein', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True + }, + {'id': 15, 'slug': 'software', 'name': 'Software', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 4, 'user_can_change': True + }, + {'id': 7, 'slug': 'steuern', 'name': 'Steuern', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 2, 'owner': 3, 'user_can_change': True + }, + {'id': 17, 'slug': 'title-changed', 'name': 'Title changed', 'colour': 1, 'match': '', 'matching_algorithm': 0, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 4, 'user_can_change': True + }, + {'id': 16, 'slug': 'todo', 'name': 'TODO', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 1, 'owner': 4, 'user_can_change': True + }, + {'id': 13, 'slug': 'versicherungen', 'name': 'Versicherungen', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 8, 'owner': 4, 'user_can_change': True + }, + {'id': 4, 'slug': 'versorger-gas', 'name': 'Versorger: Gas', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True + }, + {'id': 10, 'slug': 'versorger-strom', 'name': 'Versorger: Strom', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True + } + 
] +} \ No newline at end of file diff --git a/scripts/test.http b/scripts/test.http new file mode 100755 index 0000000..be7f23b --- /dev/null +++ b/scripts/test.http @@ -0,0 +1,6 @@ + + +GET http://muckibude.fritz.box:8010/api/tags HTTP/1.1 +Authorization: token 0cf8eb062d0ecfc0aa70611125427692cb577d68 + +#### diff --git a/scripts/test_post_consume.py b/scripts/test_post_consume.py new file mode 100755 index 0000000..956f03a --- /dev/null +++ b/scripts/test_post_consume.py @@ -0,0 +1 @@ +def test_ \ No newline at end of file