This commit is contained in:
Bastian Schnorbus
2025-05-04 16:01:16 +02:00
commit 5941117b86
11 changed files with 302 additions and 0 deletions

2
scripts/helper/__init__.py Executable file
View File

@@ -0,0 +1,2 @@
from .paperless_api import PaperlessAPI
from .config import Config

29
scripts/helper/config.py Executable file
View File

@@ -0,0 +1,29 @@
import os
class Config():
    """Read the post-consume script configuration from environment variables.

    Required:
        PAPERLESS_BASE_URL                          -- base URL of the Paperless instance
        PAPERLESS_POST_CONSUME_AUTH_TOKEN           -- API auth token
    Optional:
        PAPERLESS_POST_CONSUME_EXTRACTOR_NOTIFY_TAG -- tag name set on retitled
                                                       documents (default: "Title changed")
        PAPERLESS_POST_CONSUME_EXPAND_BY_DATE       -- append month/year to extracted
                                                       titles (default: enabled)
    """

    def __init__(self) -> None:
        self._paperless_url = os.environ["PAPERLESS_BASE_URL"]
        self._auth_token = os.environ["PAPERLESS_POST_CONSUME_AUTH_TOKEN"]
        self._notify_tag = os.getenv("PAPERLESS_POST_CONSUME_EXTRACTOR_NOTIFY_TAG", "Title changed")
        # BUG FIX: the original test was inverted -- it evaluated to True exactly
        # when the variable held a *false* value. Treat anything except an explicit
        # false value ("false"/"0"/"f", case-insensitive) as enabled.
        self._expand_by_date = os.getenv("PAPERLESS_POST_CONSUME_EXPAND_BY_DATE", 'True').lower() not in ('false', '0', 'f')

    def get_paperless_url(self):
        """Return the configured Paperless base URL."""
        return self._paperless_url

    def get_paperless_api_url(self):
        """Return the REST API root (base URL + "/api")."""
        return self._paperless_url + "/api"

    def get_auth_token(self):
        """Return the API auth token."""
        return self._auth_token

    def get_notify_tag(self) -> str:
        """Return the tag name to attach to documents whose title was changed."""
        return self._notify_tag

    def expand_title_by_date(self) -> bool:
        """Return True if "<Month> <Year>" should be appended to extracted titles.

        BUG FIX: the original returned a hard-coded True and ignored the
        environment variable; it now honours PAPERLESS_POST_CONSUME_EXPAND_BY_DATE.
        """
        return self._expand_by_date

    def __str__(self) -> str:
        # NOTE(review): this exposes the auth token; callers print it for debugging.
        return " ".join([self._paperless_url, self.get_paperless_api_url(), self._auth_token])

45
scripts/helper/paperless_api.py Executable file
View File

@@ -0,0 +1,45 @@
# TODO: The user can use anything in the standard library, installed for paperless
# or use the custom startup scripts to install additional libraries via pip
import requests
class PaperlessAPI:
    """Minimal client for the Paperless-ngx REST API using token authentication."""

    def __init__(self, api_url, auth_token, timeout=5) -> None:
        self._base_api_url = api_url
        self._auth_token = auth_token
        self._timeout = timeout

    def _auth_headers(self):
        """Return the Authorization header sent with every request."""
        return {"Authorization": f"Token {self._auth_token}"}

    def _get_item_by_id(self, item_type, item_id):
        """Fetch a single item of *item_type* (e.g. "documents") by id.

        Returns the decoded JSON object, or {} when the id is falsy or the
        request fails.
        """
        if item_id:
            # BUG FIX: GET requests previously had no timeout and could hang forever.
            response = requests.get(f"{self._base_api_url}/{item_type}/{item_id}/",
                                    headers=self._auth_headers(),
                                    timeout=self._timeout)
            if response.ok:
                return response.json()
        return {}

    def get_document_by_id(self, document_id):
        """Return the document JSON for *document_id*, or {} if unavailable."""
        return self._get_item_by_id("documents", document_id)

    def get_tag_id_by_name(self, tag_name):
        """Return the id of the tag named *tag_name*, or None if not found.

        BUG FIXES vs. the original: follows the paginated "next" links instead
        of only inspecting the first page, tolerates a missing "results" key,
        and applies the configured timeout.
        """
        url = f"{self._base_api_url}/tags/"
        while url:
            response = requests.get(url,
                                    headers=self._auth_headers(),
                                    timeout=self._timeout)
            if not response.ok:
                break
            payload = response.json()
            for tag in payload.get("results") or []:
                if tag.get("name") == tag_name:
                    return tag.get("id")
            url = payload.get("next")
        return None

    def patch_document(self, document_id, data):
        """PATCH *data* onto the document and return the raw response object."""
        return requests.patch(f"{self._base_api_url}/documents/{document_id}/",
                              headers=self._auth_headers(),
                              data=data,
                              timeout=self._timeout)

94
scripts/post_consume.py Executable file
View File

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
import os
import re
from datetime import datetime
from helper import Config
from helper import PaperlessAPI
# need to have this manually as setlocale() won't work in Paperless docker setup
# German month names, 1-indexed so that MONTH_NAMES[datetime.month] works
# directly; index 0 is a never-used placeholder.
MONTH_NAMES = [
    "0-Index",
    "Januar",
    "Februar",
    "März",
    "April",
    "Mai",
    "Juni",
    "Juli",
    "August",
    "September",
    "Oktober",
    "November",
    "Dezember"
]
def expand_by_non_existing_parts(title, parts=None) -> str:
    """Append each part to *title*, skipping parts already contained in it.

    BUG FIX: the original used a mutable default argument (``parts=[]``),
    which is shared across calls; ``None`` is used as the sentinel instead.

    :param title: the base title string
    :param parts: iterable of values; each is passed through ``str()`` before
                  the containment check and before being appended
    :return: the (possibly expanded) title
    """
    if not parts:
        return title
    for part in parts:
        if str(part) not in title:
            title += " " + str(part)
    return title
# Entry point: invoked by Paperless-ngx as a post-consume hook for each
# freshly consumed document.
if __name__ == "__main__":
    cfg = Config()
    print("START POST CONSUME")
    # NOTE(review): Config.__str__ includes the auth token, so this prints the
    # secret into the consume log -- consider removing.
    print(cfg)
    api = PaperlessAPI(cfg.get_paperless_api_url(), cfg.get_auth_token())
    # Paperless exports DOCUMENT_ID into the post-consume script environment.
    document_id = os.environ["DOCUMENT_ID"]
    document = api.get_document_by_id(document_id)
    # NOTE(review): orig_filename is fetched but never used below.
    orig_filename = document["original_file_name"]
    title = document["title"]
    # Match titles starting with an 8-digit date (YYYYMMDD): group(1) is the
    # date digits, group(2) a separator (space-nonword-space or one space),
    # group(3) the remaining title text.
    pattern = re.compile(r'^(\d{8})(\s\W\s|\s)(.*)')
    findings = pattern.match(title)
    if findings:
        extracted_new_title = findings.group(3)
        print(f'Extracted title="{extracted_new_title}"')
        # should add Month Year suffix to title?
        if cfg.expand_title_by_date():
            print("add suffix")
            date_string = findings.group(1)
            print(f"title date string={date_string}")
            d=datetime.strptime(date_string, "%Y%m%d")
            print(f"parsed date={d}")
            # German "<Month> <Year>" suffix, e.g. "Mai 2025"; MONTH_NAMES is
            # 1-indexed so d.month indexes it directly.
            suffix = f'{MONTH_NAMES[d.month]} {d.year}'
            print(f"created suffix={suffix}")
            # Only appends parts not already present in the extracted title.
            extracted_new_title = expand_by_non_existing_parts(extracted_new_title, [MONTH_NAMES[d.month], d.year])
        # Optionally mark the document with the configured "notify" tag so
        # retitled documents are easy to find in the UI.
        tag_id = api.get_tag_id_by_name(cfg.get_notify_tag())
        print(tag_id)
        if tag_id is None:
            data = {"title": extracted_new_title}
        else:
            # Keep the existing tags and append the notify tag's id.
            document["tags"].append(tag_id)
            data = {"title": extracted_new_title, "tags": document["tags"]}
        response = api.patch_document(document_id, data)
        print(f'Status Code = {response.status_code}')
        # Fail loudly so Paperless logs a non-zero post-consume exit.
        response.raise_for_status()
    else:
        print("Nothing changed")
    print("END POST CONSUME")

6
scripts/pre-consume.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Paperless-ngx pre-consume hook: runs the individual pre-processing steps
# on the incoming document (path supplied via DOCUMENT_WORKING_PATH).
# -x traces each command into the consume log for debugging.
set -x
# Remove blank pages
/usr/src/paperless/scripts/remove-blank-pages.sh

51
scripts/remove-blank-pages.sh Executable file
View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Drop (near-)blank pages from a scanned PDF before Paperless consumes it.
# Based on:
# https://github.com/paperless-ngx/paperless-ngx/wiki/Pre-Consume-Script-Examples#removing-blank-pages
# set -x -e -o pipefail
set -e -o pipefail
export LC_ALL=C

# Paperless hands the file to pre-consume scripts via DOCUMENT_WORKING_PATH.
IN="$DOCUMENT_WORKING_PATH"

# Only PDFs can be processed; pass everything else through untouched.
TYPE=$(file -b "$IN")
if [ "${TYPE%%,*}" != "PDF document" ]; then
    >&2 echo "Skipping $IN - non PDF [$TYPE]."
    exit 0
fi

# PDF file - proceed
PAGES=$(pdfinfo "$IN" | awk '/Pages:/ {print $2}')
>&2 echo "Total pages $PAGES"

# Ink-coverage threshold below which a page is treated as blank.
# Threshold for HP scanners
# THRESHOLD=1
# Threshold for Lexmark MC2425
THRESHOLD=0.8

# Emit (on stdout) the 1-based numbers of all pages whose summed CMYK ink
# coverage exceeds THRESHOLD; progress messages go to stderr.
non_blank() {
    for i in $(seq 1 "$PAGES") ; do
        # gs's ink_cov device reports per-page C M Y K coverage; sum the four
        # channels. BUG FIX: use awk instead of nawk -- nawk is not available
        # on the Debian-based Paperless container images.
        PERCENT=$(gs -o - -dFirstPage=${i} -dLastPage=${i} -sDEVICE=ink_cov "${IN}" | grep CMYK | awk 'BEGIN { sum=0; } {sum += $1 + $2 + $3 + $4;} END { printf "%.5f\n", sum } ')
        >&2 echo -n "Color-sum in page $i is $PERCENT: "
        if awk "BEGIN { exit !($PERCENT > $THRESHOLD) }"; then
            echo "$i"
            >&2 echo "Page added to document"
        else
            >&2 echo "Page removed from document"
        fi
    done
}

NON_BLANK=$(non_blank)
if [ -n "$NON_BLANK" ]; then
    # qpdf expects a comma-separated page list; rewrite the file in place.
    NON_BLANK=$(echo $NON_BLANK | tr ' ' ",")
    qpdf "$IN" --warning-exit-0 --replace-input --pages . $NON_BLANK --
fi

59
scripts/tags.json Executable file
View File

@@ -0,0 +1,59 @@
{
  "count": 17,
  "next": null,
  "previous": null,
  "all": [12, 9, 1, 5, 11, 2, 8, 14, 6, 3, 15, 7, 17, 16, 13, 4, 10],
  "results": [
    {"id": 12, "slug": "buro", "name": "Büro", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true},
    {"id": 9, "slug": "elektronik", "name": "Elektronik", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 3, "owner": 3, "user_can_change": true},
    {"id": 1, "slug": "inbox", "name": "Inbox", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": true, "document_count": 2, "owner": 3, "user_can_change": true},
    {"id": 5, "slug": "internet", "name": "Internet", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true},
    {"id": 11, "slug": "konto", "name": "Konto", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true},
    {"id": 2, "slug": "kredit", "name": "Kredit", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true},
    {"id": 8, "slug": "kreditkarte", "name": "Kreditkarte", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 3, "owner": 3, "user_can_change": true},
    {"id": 14, "slug": "lizenzschlussel", "name": "Lizenzschlüssel", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 1, "owner": 4, "user_can_change": true},
    {"id": 6, "slug": "mobilfunk", "name": "Mobilfunk", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 4, "owner": 3, "user_can_change": true},
    {"id": 3, "slug": "ruckschein", "name": "Rückschein", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true},
    {"id": 15, "slug": "software", "name": "Software", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 4, "user_can_change": true},
    {"id": 7, "slug": "steuern", "name": "Steuern", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 2, "owner": 3, "user_can_change": true},
    {"id": 17, "slug": "title-changed", "name": "Title changed", "colour": 1, "match": "", "matching_algorithm": 0, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 4, "user_can_change": true},
    {"id": 16, "slug": "todo", "name": "TODO", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 1, "owner": 4, "user_can_change": true},
    {"id": 13, "slug": "versicherungen", "name": "Versicherungen", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 8, "owner": 4, "user_can_change": true},
    {"id": 4, "slug": "versorger-gas", "name": "Versorger: Gas", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true},
    {"id": 10, "slug": "versorger-strom", "name": "Versorger: Strom", "colour": 1, "match": "", "matching_algorithm": 6, "is_insensitive": true, "is_inbox_tag": false, "document_count": 0, "owner": 3, "user_can_change": true}
  ]
}

6
scripts/test.http Executable file
View File

@@ -0,0 +1,6 @@
GET http://muckibude.fritz.box:8010/api/tags HTTP/1.1
# NOTE(review): a real-looking API token is committed here -- rotate it on the
# server and replace it with a placeholder before publishing this repository.
Authorization: token 0cf8eb062d0ecfc0aa70611125427692cb577d68
####

1
scripts/test_post_consume.py Executable file
View File

@@ -0,0 +1 @@
def test_