Skip to content

Commit 4694bb0

Browse files
authored
Merge pull request #12 from openaleph/chore/ftm4
v3.24.0
2 parents (98df27d + 55595ac), commit 4694bb0

File tree

12 files changed: +597 / -744 lines changed

12 files changed: +597 / -744 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
data/model_type_prediction.ftz
2+
data/servicelayer-archive
23
# Byte-compiled / optimized / DLL files
34
__pycache__/
45
*.py[cod]

ingestors/cli.py

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,21 @@
1-
import sys
2-
import click
31
import logging
2+
import sys
43
from pprint import pprint
5-
from ftmstore import get_dataset
6-
from servicelayer.cache import get_redis, get_fakeredis
7-
from servicelayer.logs import configure_logging
8-
from servicelayer.jobs import Job, Dataset
4+
5+
import click
6+
from ftmq.store.fragments import get_dataset
97
from servicelayer import settings as sl_settings
108
from servicelayer.archive.util import ensure_path
9+
from servicelayer.cache import get_fakeredis, get_redis
10+
from servicelayer.jobs import Dataset, Job
11+
from servicelayer.logs import configure_logging
1112
from servicelayer.tags import Tags
1213

1314
from ingestors import settings
14-
from ingestors.manager import Manager
15-
from ingestors.directory import DirectoryIngestor
1615
from ingestors.analysis import Analyzer
17-
from ingestors.worker import IngestWorker, OP_ANALYZE, OP_INGEST
16+
from ingestors.directory import DirectoryIngestor
17+
from ingestors.manager import Manager
18+
from ingestors.worker import OP_ANALYZE, OP_INGEST, IngestWorker
1819

1920
log = logging.getLogger(__name__)
2021
STAGES = [OP_ANALYZE, OP_INGEST]
@@ -50,7 +51,8 @@ def killthekitten():
5051
conn.flushall()
5152

5253

53-
def _ingest_path(db, conn, dataset, path, languages=[]):
54+
def _ingest_path(db, conn, dataset, path, languages=None):
55+
languages = languages or []
5456
context = {"languages": languages}
5557
job = Job.create(conn, dataset)
5658
stage = job.get_stage(OP_INGEST)

ingestors/email/vcard.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,14 @@
11
import logging
2+
23
import vobject
3-
from vobject.base import ParseError
44
from banal import ensure_list
55
from followthemoney import model
66
from followthemoney.util import sanitize_text
7+
from vobject.base import Component, ContentLine, ParseError
78

9+
from ingestors.exc import ProcessingException
810
from ingestors.ingestor import Ingestor
911
from ingestors.support.encoding import EncodingSupport
10-
from ingestors.exc import ProcessingException
1112

1213
log = logging.getLogger(__name__)
1314

@@ -17,9 +18,9 @@ class VCardIngestor(Ingestor, EncodingSupport):
1718
EXTENSIONS = ["vcf", "vcard"]
1819
SCORE = 10
1920

20-
def get_field(self, card, field):
21-
items = ensure_list(card.contents.get(field))
22-
return [i.value for i in items]
21+
def get_field(self, card: Component, field: str):
22+
items: list[ContentLine] = ensure_list(card.contents.get(field))
23+
return [str(i.value) for i in items]
2324

2425
def ingest_card(self, entity, card):
2526
person = self.manager.make_entity("Person")

ingestors/manager.py

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,27 +1,27 @@
1-
import magic
21
import logging
3-
from timeit import default_timer
4-
from tempfile import mkdtemp
52
from datetime import datetime
6-
from pkg_resources import get_distribution
3+
from tempfile import mkdtemp
4+
from timeit import default_timer
75

8-
from followthemoney import model
6+
import magic
97
from banal import ensure_list
8+
from followthemoney import model
9+
from followthemoney.helpers import entity_filename
10+
from followthemoney.namespace import Namespace
11+
from ftmq.store.fragments.utils import safe_fragment
1012
from normality import stringify
1113
from pantomime import normalize_mimetype
12-
from ftmstore.utils import safe_fragment
14+
from pkg_resources import get_distribution
15+
from prometheus_client import Counter, Histogram
16+
from sentry_sdk import capture_exception
1317
from servicelayer.archive import init_archive
1418
from servicelayer.archive.util import ensure_path
1519
from servicelayer.extensions import get_extensions
16-
from sentry_sdk import capture_exception
17-
from followthemoney.helpers import entity_filename
18-
from followthemoney.namespace import Namespace
19-
from prometheus_client import Counter, Histogram
2020

21+
from ingestors import settings
2122
from ingestors.directory import DirectoryIngestor
22-
from ingestors.exc import ProcessingException, ENCRYPTED_MSG
23+
from ingestors.exc import ENCRYPTED_MSG, ProcessingException
2324
from ingestors.util import filter_text, remove_directory
24-
from ingestors import settings
2525

2626
log = logging.getLogger(__name__)
2727

@@ -68,9 +68,9 @@ class Manager(object):
6868
"""Handles the lifecycle of an ingestor. This can be subclassed to embed it
6969
into a larger processing framework."""
7070

71-
#: Indicates that during the processing no errors or failures occured.
71+
#: Indicates that during the processing no errors or failures occurred.
7272
STATUS_SUCCESS = "success"
73-
#: Indicates occurance of errors during the processing.
73+
#: Indicates occurrence of errors during the processing.
7474
STATUS_FAILURE = "failure"
7575

7676
MAGIC = magic.Magic(mime=True)

ingestors/settings.py

Lines changed: 2 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
from servicelayer import env
22
from servicelayer import settings as sls
3-
from ftmstore import settings as fts
43

54
TESTING = False
65

@@ -43,25 +42,12 @@
4342
"INGESTORS_TYPE_MODEL_PATH", "/models/model_type_prediction.ftz"
4443
)
4544

46-
# Use the environment variable set in aleph.env
47-
fts.DATABASE_URI = env.get(
48-
"FTM_STORE_URI", env.get("ALEPH_DATABASE_URI", fts.DATABASE_URI)
49-
)
50-
5145
# Also store cached values in the SQL database
52-
sls.TAGS_DATABASE_URI = fts.DATABASE_URI
46+
sls.TAGS_DATABASE_URI = env.get("FTM_STORE_URI", env.get("ALEPH_DATABASE_URI"))
5347

54-
# ProcessingException is thrown whenever something goes wrong wiht
48+
# ProcessingException is thrown whenever something goes wrong with
5549
# parsing a file. Enable this with care, it can easily eat up the
5650
# Sentry quota of events.
5751
SENTRY_CAPTURE_PROCESSING_EXCEPTIONS = env.to_bool(
5852
"SENTRY_CAPTURE_PROCESSING_EXCEPTIONS", False
5953
)
60-
61-
WHISPER_MODEL = env.get("INGESTORS_WHISPER_MODEL", "ggml-medium-q8_0.bin")
62-
# "auto" prompts the model to detect the language
63-
WHISPER_LANGUAGE = env.get("INGESTORS_WHISPER_LANGUAGE", "auto")
64-
# timeout expressed in seconds
65-
WHISPER_TRANSCRIPTION_TIMEOUT = env.get(
66-
"INGESTORS_WHISPER_TRANSCRIPTION_TIMEOUT", 60 * 60 * 2
67-
)

ingestors/support/email.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,15 @@
1+
import logging
12
import re
23
import types
3-
import logging
4+
from email.utils import getaddresses, parsedate_to_datetime
5+
46
from banal import ensure_list
5-
from normality import stringify
6-
from ftmstore.utils import safe_fragment
7-
from email.utils import parsedate_to_datetime, getaddresses
8-
from normality import safe_filename, ascii_text
97
from followthemoney.types import registry
8+
from ftmq.store.fragments.utils import safe_fragment
9+
from normality import ascii_text, safe_filename, stringify
1010

11-
from ingestors.support.html import HTMLSupport
1211
from ingestors.support.cache import CacheSupport
12+
from ingestors.support.html import HTMLSupport
1313
from ingestors.support.temp import TempFileSupport
1414

1515
log = logging.getLogger(__name__)
@@ -195,8 +195,8 @@ def extract_msg_headers(self, entity, msg):
195195
sender = self.get_header_identities(msg, "Sender", "X-Sender")
196196
self.apply_identities(entity, sender, "emitters", "sender")
197197

198-
froms = self.get_header_identities(msg, "From", "X-From")
199-
self.apply_identities(entity, froms, "emitters", "from")
198+
froms = self.get_header_identities(msg, "From", "X-From") # codespell:ignore
199+
self.apply_identities(entity, froms, "emitters", "from") # codespell:ignore
200200

201201
tos = self.get_header_identities(msg, "To", "Resent-To")
202202
self.apply_identities(entity, tos, "recipients", "to")

ingestors/worker.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,14 @@
11
import logging
2+
23
from followthemoney import model
3-
from ftmstore import get_dataset
4-
from servicelayer.worker import Worker
5-
from servicelayer.logs import apply_task_context
4+
from ftmq.store.fragments import get_dataset
65
from prometheus_client import Info
6+
from servicelayer.logs import apply_task_context
7+
from servicelayer.worker import Worker
78

89
from ingestors import __version__
9-
from ingestors.manager import Manager
1010
from ingestors.analysis import Analyzer
11+
from ingestors.manager import Manager
1112

1213
log = logging.getLogger(__name__)
1314
OP_INGEST = "ingest"

0 commit comments

Comments
 (0)