commit 5941117b86df758fd366d86746d5faabc2a2919b
Author: Bastian Schnorbus <bastian.schnorbus@gmail.com>
Date:   Sun May 4 16:01:16 2025 +0200

    update

diff --git a/.env b/.env
new file mode 100755
index 0000000..69e291c
--- /dev/null
+++ b/.env
@@ -0,0 +1,2 @@
+IMAGE_VERSION_PAPERLESS="2.14.7"
+IMAGE_VERSION_GOTENBERG="8.9.2"
diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..d6dde53
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+media/
+data/
+consume/
+export/
+database-dump/
+*.gz
+**/__pycache__/**
diff --git a/scripts/helper/__init__.py b/scripts/helper/__init__.py
new file mode 100755
index 0000000..719c1e0
--- /dev/null
+++ b/scripts/helper/__init__.py
@@ -0,0 +1,2 @@
+from .paperless_api import PaperlessAPI
+from .config import Config
diff --git a/scripts/helper/config.py b/scripts/helper/config.py
new file mode 100755
index 0000000..3cece67
--- /dev/null
+++ b/scripts/helper/config.py
@@ -0,0 +1,29 @@
+import os
+
+
+class Config():
+    """Settings for the post-consume script, read once from environment variables."""
+    def __init__(self) -> None:
+        self._paperless_url = os.environ["PAPERLESS_BASE_URL"]  # required, KeyError if unset
+        self._auth_token = os.environ["PAPERLESS_POST_CONSUME_AUTH_TOKEN"]  # required, KeyError if unset
+        self._notify_tag = os.getenv("PAPERLESS_POST_CONSUME_EXTRACTOR_NOTIFY_TAG", "Title changed")
+        self._expand_by_date = os.getenv("PAPERLESS_POST_CONSUME_EXPAND_BY_DATE", "True").strip().lower() not in ("false", "0", "f")
+
+    def get_paperless_url(self):
+        return self._paperless_url
+
+    def get_paperless_api_url(self):
+        return self._paperless_url.rstrip("/") + "/api"  # avoid "//api" when the base URL ends with "/"
+
+    def get_auth_token(self):
+        return self._auth_token
+
+    def get_notify_tag(self) -> str:
+        return self._notify_tag
+
+    def expand_title_by_date(self) -> bool:
+        return self._expand_by_date  # enabled unless the variable is "false", "0" or "f"
+
+    def __str__(self) -> str:
+        return " ".join([self._paperless_url, self.get_paperless_api_url(), "<auth token redacted>"])  # never expose the token
+        
\ No newline at end of file
diff --git a/scripts/helper/paperless_api.py b/scripts/helper/paperless_api.py
new file mode 100755
index 0000000..5a3a350
--- /dev/null
+++ b/scripts/helper/paperless_api.py
@@ -0,0 +1,45 @@
+# TODO: The user can use anything in the standard library, installed for paperless
+# or use the custom startup scripts to install additional libraries via pip
+
+import requests
+
+
+class PaperlessAPI:
+    """Minimal Paperless REST client that authenticates with an API token."""
+    def __init__(self, api_url, auth_token, timeout=5) -> None:
+        self._base_api_url = api_url
+        self._auth_token = auth_token
+        self._timeout = timeout  # passed as `timeout` to every request
+
+    def _get_item_by_id(self, item_type, item_id):
+        """Fetch one item as a dict; return {} for a falsy id or an unsuccessful reply."""
+        if not item_id:
+            return {}  # previously fell through to an unbound `response`
+        response = requests.get(f"{self._base_api_url}/{item_type}/{item_id}/",
+                                headers={"Authorization": f"Token {self._auth_token}"},
+                                timeout=self._timeout)
+        return response.json() if response.ok else {}
+
+    def get_document_by_id(self, document_id):
+        """Return the document's JSON dict, or {} when it cannot be fetched."""
+        return self._get_item_by_id("documents", document_id)
+
+    def get_tag_id_by_name(self, tag_name):
+        """Return the id of the first listed tag named `tag_name`, else None."""
+        response = requests.get(f"{self._base_api_url}/tags/",
+                                headers={"Authorization": f"Token {self._auth_token}"},
+                                timeout=self._timeout)
+        if not response.ok:
+            return None
+        # NOTE(review): only the first response page is inspected; confirm the
+        # tag list cannot exceed one page on this installation.
+        for tag in response.json().get("results") or []:
+            if tag.get("name") == tag_name:
+                return tag.get("id")
+        return None
+
+    def patch_document(self, document_id, data):
+        """PATCH `data` onto the document and return the raw response object."""
+        return requests.patch(f"{self._base_api_url}/documents/{document_id}/",
+                              headers={"Authorization": f"Token {self._auth_token}"},
+                              data=data,
+                              timeout=self._timeout)
diff --git a/scripts/post_consume.py b/scripts/post_consume.py
new file mode 100755
index 0000000..2272d7b
--- /dev/null
+++ b/scripts/post_consume.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+import os
+import re
+from datetime import datetime
+from helper import Config
+from helper import PaperlessAPI
+
+
+
+
+# German month names, kept by hand because setlocale() won't work in the Paperless docker setup.
+MONTH_NAMES = [
+    "0-Index",  # placeholder so that the list index equals the calendar month number
+    "Januar",
+    "Februar",
+    "März",
+    "April",
+    "Mai",
+    "Juni",
+    "Juli",
+    "August",
+    "September",
+    "Oktober",
+    "November",
+    "Dezember"
+]
+
+#
+# Expand the string by the provided parts, but only if the input string does not contain those parts yet.
+#
+def expand_by_non_existing_parts(title, parts=()) -> str:
+    """Append each stringified part to `title` unless it already occurs in it.
+
+    Matching is a plain substring test; an empty or None `parts` is a no-op.
+    """
+    for text in map(str, parts or ()):
+        if text not in title:
+            title += " " + text
+    return title
+
+
+if __name__ == "__main__":
+    # Post-consume hook: strip a leading 8-digit date stamp from the document title,
+    # optionally append "<month name> <year>", and add the notify tag to the document.
+    cfg = Config()
+
+    print("START POST CONSUME")
+    # Deliberately not print(cfg): its text form may include the API auth token.
+    print(cfg.get_paperless_url(), cfg.get_paperless_api_url())
+
+    api = PaperlessAPI(cfg.get_paperless_api_url(), cfg.get_auth_token())
+
+    document_id = os.environ["DOCUMENT_ID"]
+    document = api.get_document_by_id(document_id)
+    if not document:
+        # An empty dict means the lookup failed; exit with a clear message, not a KeyError.
+        raise SystemExit(f"Document {document_id} could not be fetched")
+
+    title = document["title"]
+
+    pattern = re.compile(r'^(\d{8})(\s\W\s|\s)(.*)')
+    findings = pattern.match(title)
+    if findings:
+        extracted_new_title = findings.group(3)
+        print(f'Extracted title="{extracted_new_title}"')
+
+        # should add Month Year suffix to title?
+        if cfg.expand_title_by_date():
+            date_string = findings.group(1)
+            print(f"title date string={date_string}")
+            try:
+                d = datetime.strptime(date_string, "%Y%m%d")
+            except ValueError:
+                # Eight digits that are not a calendar date: keep the title without suffix.
+                print("not a valid date, no suffix added")
+            else:
+                print(f"parsed date={d}")
+                extracted_new_title = expand_by_non_existing_parts(extracted_new_title, [MONTH_NAMES[d.month], d.year])
+
+        data = {"title": extracted_new_title}
+        tag_id = api.get_tag_id_by_name(cfg.get_notify_tag())
+        print(tag_id)
+        if tag_id is not None:
+            # Send the complete tag list, with the notify tag added only once.
+            data["tags"] = list(dict.fromkeys([*(document.get("tags") or []), tag_id]))
+
+        response = api.patch_document(document_id, data)
+        print(f'Status Code = {response.status_code}')
+        response.raise_for_status()
+    else:
+        print("Nothing changed")
+
+    print("END POST CONSUME")
diff --git a/scripts/pre-consume.sh b/scripts/pre-consume.sh
new file mode 100755
index 0000000..b134951
--- /dev/null
+++ b/scripts/pre-consume.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Trace every command as it is executed.
+set -x
+# Remove blank pages (delegated to the separate remove-blank-pages.sh script).
+/usr/src/paperless/scripts/remove-blank-pages.sh
diff --git a/scripts/remove-blank-pages.sh b/scripts/remove-blank-pages.sh
new file mode 100755
index 0000000..d142869
--- /dev/null
+++ b/scripts/remove-blank-pages.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Removes pages judged blank from the PDF at $DOCUMENT_WORKING_PATH.
+# https://github.com/paperless-ngx/paperless-ngx/wiki/Pre-Consume-Script-Examples#removing-blank-pages
+# Debug variant with command tracing: set -x -e -o pipefail
+set -e -o pipefail
+export LC_ALL=C
+
+#IN="$1"
+IN="$DOCUMENT_WORKING_PATH"
+
+# Only PDFs are handled; any other file type is skipped with exit status 0.
+TYPE=$(file -b "$IN")
+
+if [ "${TYPE%%,*}" != "PDF document" ]; then
+  >&2 echo "Skipping $IN - non PDF [$TYPE]."
+  exit 0
+fi
+
+# PDF file - proceed
+# Page count as reported by pdfinfo (the commented-out line is an alternative extraction).
+#PAGES=$(pdfinfo "$IN" | grep ^Pages: | tr -dc '0-9')
+PAGES=$(pdfinfo "$IN" | awk '/Pages:/ {print $2}')
+
+>&2 echo Total pages $PAGES
+
+# A page is kept only if its summed ink coverage is greater than THRESHOLD.
+# Threshold for HP scanners
+# THRESHOLD=1
+# Threshold for Lexmark MC2425
+THRESHOLD=0.8
+
+# Print the number of every page whose summed CMYK ink coverage (gs ink_cov device) exceeds THRESHOLD.
+non_blank() {
+  for i in $(seq 1 "$PAGES") ; do
+    PERCENT=$(gs -o -  -dFirstPage="${i}" -dLastPage="${i}" -sDEVICE=ink_cov "${IN}" | grep CMYK | awk 'BEGIN { sum=0; } {sum += $1 + $2 + $3 + $4;} END {  printf "%.5f\n", sum } ')
+    >&2 echo -n "Color-sum in page $i is $PERCENT: "
+    if awk "BEGIN { exit !($PERCENT > $THRESHOLD) }"; then
+      echo "$i"
+      >&2 echo "Page added to document"
+    else
+      >&2 echo "Page removed from document"
+    fi
+  done
+}
+# Page numbers to keep, one per line; empty when no page passed the threshold.
+NON_BLANK=$(non_blank)
+# Rewrite the PDF only when at least one page is kept.
+if [ -n "$NON_BLANK" ]; then
+  NON_BLANK=$(echo $NON_BLANK  | tr ' ' ",")  # unquoted echo folds the newlines into spaces first
+  qpdf "$IN" --warning-exit-0 --replace-input --pages . $NON_BLANK --
+fi
diff --git a/scripts/tags.json b/scripts/tags.json
new file mode 100755
index 0000000..9a82411
--- /dev/null
+++ b/scripts/tags.json
@@ -0,0 +1,59 @@
+{"count": 17,
+"next": null,
+"previous": null,
+"all": [
+        12,
+        9,
+        1,
+        5,
+        11,
+        2,
+        8,
+        14,
+        6,
+        3,
+        15,
+        7,
+        17,
+        16,
+        13,
+        4,
+        10
+    ],
+    "results": [
+        {"id": 12, "slug": "buro", "name": "Büro", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true
+        },
+        {"id": 9, "slug": "elektronik", "name": "Elektronik", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 3, "owner": 3, "user_can_change": true
+        },
+        {"id": 1, "slug": "inbox", "name": "Inbox", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": true, "document_count": 2, "owner": 3, "user_can_change": true
+        },
+        {"id": 5, "slug": "internet", "name": "Internet", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true
+        },
+        {"id": 11, "slug": "konto", "name": "Konto", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true
+        },
+        {"id": 2, "slug": "kredit", "name": "Kredit", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true
+        },
+        {"id": 8, "slug": "kreditkarte", "name": "Kreditkarte", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 3, "owner": 3, "user_can_change": true
+        },
+        {"id": 14, "slug": "lizenzschlussel", "name": "Lizenzschlüssel", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 1, "owner": 4, "user_can_change": true
+        },
+        {"id": 6, "slug": "mobilfunk", "name": "Mobilfunk", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true
+        },
+        {"id": 3, "slug": "ruckschein", "name": "Rückschein", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true
+        },
+        {"id": 15, "slug": "software", "name": "Software", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 4, "user_can_change": true
+        },
+        {"id": 7, "slug": "steuern", "name": "Steuern", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 2, "owner": 3, "user_can_change": true
+        },
+        {"id": 17, "slug": "title-changed", "name": "Title changed", "colour": 1, "match": "", "matching_algorithm": 0, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 4, "user_can_change": true
+        },
+        {"id": 16, "slug": "todo", "name": "TODO", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 1, "owner": 4, "user_can_change": true
+        },
+        {"id": 13, "slug": "versicherungen", "name": "Versicherungen", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 8, "owner": 4, "user_can_change": true
+        },
+        {"id": 4, "slug": "versorger-gas", "name": "Versorger: Gas", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true
+        },
+        {"id": 10, "slug": "versorger-strom", "name": "Versorger: Strom", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true
+        }
+    ]
+}
\ No newline at end of file
diff --git a/scripts/test.http b/scripts/test.http
new file mode 100755
index 0000000..be7f23b
--- /dev/null
+++ b/scripts/test.http
@@ -0,0 +1,6 @@
+# Supply paperless_token from a local, uncommitted REST-client environment; never commit the real token.
+
+GET http://muckibude.fritz.box:8010/api/tags HTTP/1.1
+Authorization: token {{paperless_token}}
+
+####
diff --git a/scripts/test_post_consume.py b/scripts/test_post_consume.py
new file mode 100755
index 0000000..956f03a
--- /dev/null
+++ b/scripts/test_post_consume.py
@@ -0,0 +1 @@
+def test_
\ No newline at end of file