update
This commit is contained in:
2
.env
Executable file
2
.env
Executable file
@@ -0,0 +1,2 @@
|
||||
IMAGE_VERSION_PAPERLESS="2.14.7"
|
||||
IMAGE_VERSION_GOTENBERG="8.9.2"
|
||||
7
.gitignore
vendored
Executable file
7
.gitignore
vendored
Executable file
@@ -0,0 +1,7 @@
|
||||
media/
|
||||
data/
|
||||
consume/
|
||||
export/
|
||||
database-dump/
|
||||
*.gz
|
||||
**/__pycache__/**
|
||||
2
scripts/helper/__init__.py
Executable file
2
scripts/helper/__init__.py
Executable file
@@ -0,0 +1,2 @@
|
||||
from .paperless_api import PaperlessAPI
|
||||
from .config import Config
|
||||
29
scripts/helper/config.py
Executable file
29
scripts/helper/config.py
Executable file
@@ -0,0 +1,29 @@
|
||||
import os
|
||||
|
||||
|
||||
class Config():
    """Settings for the post-consume script, read from environment variables.

    Required variables (a missing one raises ``KeyError`` on construction):
        PAPERLESS_BASE_URL
        PAPERLESS_POST_CONSUME_AUTH_TOKEN

    Optional variables:
        PAPERLESS_POST_CONSUME_EXTRACTOR_NOTIFY_TAG  (default "Title changed")
        PAPERLESS_POST_CONSUME_EXPAND_BY_DATE        (default "True")
    """

    # Spellings of PAPERLESS_POST_CONSUME_EXPAND_BY_DATE that disable the
    # feature; anything else (including the default) enables it.
    _FALSE_VALUES = ("false", "0", "f", "no", "n", "off")

    def __init__(self) -> None:
        self._paperless_url = os.environ["PAPERLESS_BASE_URL"]
        self._auth_token = os.environ["PAPERLESS_POST_CONSUME_AUTH_TOKEN"]
        self._notify_tag = os.getenv("PAPERLESS_POST_CONSUME_EXTRACTOR_NOTIFY_TAG", "Title changed")
        raw_flag = os.getenv("PAPERLESS_POST_CONSUME_EXPAND_BY_DATE", "True")
        # The flag is enabled unless it is explicitly set to a "false" spelling.
        self._expand_by_date = raw_flag.strip().lower() not in self._FALSE_VALUES

    def get_paperless_url(self):
        """Return the base URL exactly as configured."""
        return self._paperless_url

    def get_paperless_api_url(self):
        """Return the API root URL (base URL + "/api") without a double slash."""
        return self._paperless_url.rstrip("/") + "/api"

    def get_auth_token(self):
        """Return the API auth token."""
        return self._auth_token

    def get_notify_tag(self) -> str:
        """Return the name of the tag that marks documents whose title changed."""
        return self._notify_tag

    def expand_title_by_date(self) -> bool:
        """Return whether the month/year suffix should be added to titles."""
        return self._expand_by_date

    def __str__(self) -> str:
        # The token is a credential; keep it out of anything that gets printed.
        return " ".join([self._paperless_url, self.get_paperless_api_url(), "auth_token=<hidden>"])
|
||||
|
||||
45
scripts/helper/paperless_api.py
Executable file
45
scripts/helper/paperless_api.py
Executable file
@@ -0,0 +1,45 @@
|
||||
# TODO: The user can use anything in the standard library, installed for paperless
|
||||
# or use the custom startup scripts to install additional libraries via pip
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class PaperlessAPI:
    """Small client for the Paperless REST API using token authentication.

    Args:
        api_url: API root URL, without a trailing slash.
        auth_token: Token sent in the ``Authorization`` header.
        timeout: Timeout in seconds applied to every HTTP request.
    """

    def __init__(self, api_url, auth_token, timeout=5) -> None:
        self._base_api_url = api_url
        self._auth_token = auth_token
        self._timeout = timeout

    def _headers(self):
        """Build the authentication header shared by all requests."""
        return {"Authorization": f"Token {self._auth_token}"}

    def _get_item_by_id(self, item_type, item_id):
        """Fetch one item; return ``{}`` for a falsy id or a non-OK response."""
        if not item_id:
            return {}

        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(f"{self._base_api_url}/{item_type}/{item_id}/",
                                headers=self._headers(),
                                timeout=self._timeout)
        if response.ok:
            return response.json()

        return {}

    def get_document_by_id(self, document_id):
        """Return the document as a dict, or ``{}`` if it could not be fetched."""
        return self._get_item_by_id("documents", document_id)

    def get_tag_id_by_name(self, tag_name):
        """Return the id of the tag named exactly ``tag_name``, or ``None``.

        The tag list is paginated; every page is searched by following the
        ``next`` link until the tag is found or the pages run out.
        """
        url = f"{self._base_api_url}/tags/"
        while url:
            response = requests.get(url,
                                    headers=self._headers(),
                                    timeout=self._timeout)
            if not response.ok:
                break

            print("tags ok")
            payload = response.json()
            # "results" may be absent or null; treat that as an empty page.
            for tag in payload.get("results") or []:
                if tag.get("name") == tag_name:
                    print(f'fetched = {[tag]}')
                    return tag.get("id")

            url = payload.get("next")

        print("not found, returning none")
        return None

    def patch_document(self, document_id, data):
        """PATCH ``data`` onto the document and return the raw response."""
        return requests.patch(f"{self._base_api_url}/documents/{document_id}/",
                              headers=self._headers(),
                              data=data,
                              timeout=self._timeout)
|
||||
94
scripts/post_consume.py
Executable file
94
scripts/post_consume.py
Executable file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from helper import Config
|
||||
from helper import PaperlessAPI
|
||||
|
||||
|
||||
|
||||
|
||||
# German month names are kept by hand because setlocale() won't work in the
# Paperless docker setup. Index 0 is a placeholder so that a 1-based month
# number can be used directly as the list index.
MONTH_NAMES = [
    "0-Index",
    "Januar", "Februar", "März", "April",
    "Mai", "Juni", "Juli", "August",
    "September", "Oktober", "November", "Dezember",
]
|
||||
|
||||
#""
|
||||
def expand_by_non_existing_parts(title, parts=None) -> str:
    """Append each part to ``title`` unless the title already contains it.

    Args:
        title: The string to extend.
        parts: Iterable of values to append; each is converted with ``str()``.
            ``None`` or an empty iterable leaves the title unchanged.

    Returns:
        The title followed by every missing part, each separated by a space.

    Note:
        Containment is a plain substring check, so "Mai" counts as present
        in a title such as "Mainz".
    """
    for part in parts or ():
        text = str(part)
        if text not in title:
            title += " " + text

    return title
|
||||
|
||||
|
||||
def main() -> None:
    """Rename a freshly consumed document whose title starts with a date.

    A title of the form "<8 digits><separator><rest>" is replaced by "<rest>".
    If enabled in the configuration and the digits form a valid YYYYMMDD
    date, the month name and year are appended unless already present. When
    the configured notify tag exists, it is added to the document as well.

    Raises:
        KeyError: if DOCUMENT_ID or a required configuration variable is unset.
        SystemExit: if the document cannot be fetched from the API.
        requests.HTTPError: if the PATCH request is rejected.
    """
    cfg = Config()

    print("START POST CONSUME")
    # Print only the API URL: the configuration also holds the auth token,
    # which must not end up in the logs.
    print(f"Paperless API: {cfg.get_paperless_api_url()}")

    api = PaperlessAPI(cfg.get_paperless_api_url(), cfg.get_auth_token())

    document_id = os.environ["DOCUMENT_ID"]
    document = api.get_document_by_id(document_id)
    if not document:
        # An empty dict means the lookup failed; stop with a clear message
        # instead of failing later with a KeyError.
        raise SystemExit(f"Could not fetch document {document_id} from the Paperless API")

    title = document["title"]

    pattern = re.compile(r'^(\d{8})(\s\W\s|\s)(.*)')
    findings = pattern.match(title)
    if findings is None:
        print("Nothing changed")
        print("END POST CONSUME")
        return

    extracted_new_title = findings.group(3)
    print(f'Extracted title="{extracted_new_title}"')

    # should add Month Year suffix to title?
    if cfg.expand_title_by_date():
        print("add suffix")
        date_string = findings.group(1)
        print(f"title date string={date_string}")
        try:
            d = datetime.strptime(date_string, "%Y%m%d")
        except ValueError:
            # Eight digits that are not a calendar date (e.g. "20241340").
            print(f"title date string={date_string} is not a valid date, suffix skipped")
        else:
            print(f"parsed date={d}")
            print(f"created suffix={MONTH_NAMES[d.month]} {d.year}")
            extracted_new_title = expand_by_non_existing_parts(
                extracted_new_title, [MONTH_NAMES[d.month], d.year])

    # Avoid a leading blank when nothing followed the date prefix.
    extracted_new_title = extracted_new_title.strip()

    tag_id = api.get_tag_id_by_name(cfg.get_notify_tag())
    print(tag_id)

    data = {"title": extracted_new_title}
    if tag_id is not None:
        tags = list(document.get("tags") or [])
        # Do not send the notify tag twice if the document already carries it.
        if tag_id not in tags:
            tags.append(tag_id)
        data["tags"] = tags

    response = api.patch_document(document_id, data)

    print(f'Status Code = {response.status_code}')
    response.raise_for_status()

    print("END POST CONSUME")


if __name__ == "__main__":
    main()
|
||||
|
||||
|
||||
|
||||
6
scripts/pre-consume.sh
Executable file
6
scripts/pre-consume.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Echo every command before it is executed.
set -o xtrace

# Strip blank pages from the incoming document.
/usr/src/paperless/scripts/remove-blank-pages.sh
|
||||
51
scripts/remove-blank-pages.sh
Executable file
51
scripts/remove-blank-pages.sh
Executable file
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
|
||||
|
||||
# https://github.com/paperless-ngx/paperless-ngx/wiki/Pre-Consume-Script-Examples#removing-blank-pages
|
||||
# set -x -e -o pipefail
|
||||
# Abort on the first failing command or failing pipeline stage.
set -e -o pipefail
# Fixed C locale so tool output (decimal point, labels) is parsed reliably.
export LC_ALL=C

#IN="$1"
# Path of the document to process, taken from the environment.
IN="$DOCUMENT_WORKING_PATH"

# Check for PDF format
TYPE=$(file -b "$IN")

# Only the part before the first comma is compared, ignoring version details.
if [ "${TYPE%%,*}" != "PDF document" ]; then
    >&2 echo "Skipping $IN - non PDF [$TYPE]."
    exit 0
fi

# PDF file - proceed

# Anchor on the start of the line so that only the real "Pages:" field is
# read, not another field whose value happens to contain "Pages:".
PAGES=$(pdfinfo "$IN" | awk '/^Pages:/ {print $2}')

>&2 echo "Total pages $PAGES"
|
||||
|
||||
|
||||
# Threshold for HP scanners
|
||||
# THRESHOLD=1
|
||||
# Threshold for Lexmark MC2425
|
||||
THRESHOLD=0.8
|
||||
|
||||
|
||||
# Print the number of every page whose ink coverage exceeds THRESHOLD,
# one per line on stdout. Progress messages go to stderr.
# Reads the globals IN, PAGES and THRESHOLD.
non_blank() {
    local page ink_lines percent

    for page in $(seq 1 "$PAGES"); do
        # Ghostscript's ink_cov device reports the C, M, Y and K coverage of
        # the page. If it fails or prints no CMYK line, keep the page: an
        # unmeasurable page must not be treated as blank and dropped.
        if ! ink_lines=$(gs -o - -dFirstPage="${page}" -dLastPage="${page}" -sDEVICE=ink_cov "${IN}" | grep CMYK); then
            >&2 echo "Could not determine ink coverage of page $page - page kept"
            echo "$page"
            continue
        fi

        percent=$(awk 'BEGIN { sum=0; } {sum += $1 + $2 + $3 + $4;} END { printf "%.5f\n", sum }' <<<"$ink_lines")
        >&2 echo -n "Color-sum in page $page is $percent: "

        # Pass the numbers as awk variables instead of pasting them into the
        # program text, so an unexpected value cannot become awk code.
        if awk -v sum="$percent" -v limit="$THRESHOLD" 'BEGIN { exit !(sum > limit) }'; then
            echo "$page"
            >&2 echo "Page added to document"
        else
            >&2 echo "Page removed from document"
        fi
    done
}
|
||||
|
||||
# Collect the page numbers to keep (one per line).
NON_BLANK=$(non_blank)

# Rewrite the PDF in place only when there is at least one page to keep.
if [ -n "$NON_BLANK" ]; then
    # Turn the newline-separated list into the comma-separated range qpdf expects.
    NON_BLANK=$(printf '%s' "$NON_BLANK" | tr '\n' ',')
    qpdf "$IN" --warning-exit-0 --replace-input --pages . "$NON_BLANK" --
fi
|
||||
59
scripts/tags.json
Executable file
59
scripts/tags.json
Executable file
@@ -0,0 +1,59 @@
|
||||
{'count': 17,
|
||||
'next': None,
|
||||
'previous': None,
|
||||
'all': [
|
||||
12,
|
||||
9,
|
||||
1,
|
||||
5,
|
||||
11,
|
||||
2,
|
||||
8,
|
||||
14,
|
||||
6,
|
||||
3,
|
||||
15,
|
||||
7,
|
||||
17,
|
||||
16,
|
||||
13,
|
||||
4,
|
||||
10
|
||||
],
|
||||
'results': [
|
||||
{'id': 12, 'slug': 'buro', 'name': 'Büro', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 9, 'slug': 'elektronik', 'name': 'Elektronik', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 3, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 1, 'slug': 'inbox', 'name': 'Inbox', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': True, 'document_count': 2, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 5, 'slug': 'internet', 'name': 'Internet', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 11, 'slug': 'konto', 'name': 'Konto', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 2, 'slug': 'kredit', 'name': 'Kredit', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 8, 'slug': 'kreditkarte', 'name': 'Kreditkarte', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 3, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 14, 'slug': 'lizenzschlussel', 'name': 'Lizenzschlüssel', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 1, 'owner': 4, 'user_can_change': True
|
||||
},
|
||||
{'id': 6, 'slug': 'mobilfunk', 'name': 'Mobilfunk', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 4, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 3, 'slug': 'ruckschein', 'name': 'Rückschein', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 15, 'slug': 'software', 'name': 'Software', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 4, 'user_can_change': True
|
||||
},
|
||||
{'id': 7, 'slug': 'steuern', 'name': 'Steuern', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 2, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 17, 'slug': 'title-changed', 'name': 'Title changed', 'colour': 1, 'match': '', 'matching_algorithm': 0, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 4, 'user_can_change': True
|
||||
},
|
||||
{'id': 16, 'slug': 'todo', 'name': 'TODO', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 1, 'owner': 4, 'user_can_change': True
|
||||
},
|
||||
{'id': 13, 'slug': 'versicherungen', 'name': 'Versicherungen', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 8, 'owner': 4, 'user_can_change': True
|
||||
},
|
||||
{'id': 4, 'slug': 'versorger-gas', 'name': 'Versorger: Gas', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True
|
||||
},
|
||||
{'id': 10, 'slug': 'versorger-strom', 'name': 'Versorger: Strom', 'colour': 1, 'match': '', 'matching_algorithm': 6, 'is_insensitive': True, 'is_inbox_tag': False, 'document_count': 0, 'owner': 3, 'user_can_change': True
|
||||
}
|
||||
]
|
||||
}
|
||||
6
scripts/test.http
Executable file
6
scripts/test.http
Executable file
@@ -0,0 +1,6 @@
|
||||
|
||||
|
||||
GET http://muckibude.fritz.box:8010/api/tags HTTP/1.1
|
||||
Authorization: token {{paperless_auth_token}}
|
||||
|
||||
####
|
||||
1
scripts/test_post_consume.py
Executable file
1
scripts/test_post_consume.py
Executable file
@@ -0,0 +1 @@
|
||||
def test_
|
||||
Reference in New Issue
Block a user