-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcbc.yaml
More file actions
72 lines (72 loc) · 2.34 KB
/
cbc.yaml
File metadata and controls
72 lines (72 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
---
# Manifest for the "Language-Preprocessing" block (cbc.yaml).
#
# Layout:
#   author / description / docker_image / name  - block-level metadata
#   entrypoints                                 - one mapping per entrypoint,
#                                                 each with its own envs,
#                                                 inputs and outputs
#
# Values written as `null` carry no value in this file; presumably they are
# supplied by the consuming platform at deploy/run time - TODO confirm.
author: Paul Kalhorn
description: Language preprocessing for .txt or .bib files
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
entrypoints:
  # Entrypoint 1: preprocess one attribute of a .bib file.
  preprocess_bib_file:
    description: Entrypoint for preprocessing an attribute of a .bib file
    # Environment values for this entrypoint. Booleans and integers are left
    # as native YAML types (not quoted) so their parsed types are unchanged.
    envs:
      BIB_DOWNLOAD_PATH: /tmp/input.bib
      FILTER_STOPWORDS: true
      LANGUAGE: en
      NGRAM_MAX: 3
      NGRAM_MIN: 2
      UNIGRAM_NORMALIZER: lemma
      USE_NGRAMS: true
    inputs:
      bib_input:
        # File-input settings; all keys share the `bib_file_` prefix.
        config:
          bib_file_BUCKET_NAME: null
          bib_file_FILE_EXT: bib
          bib_file_FILE_NAME: null
          bib_file_FILE_PATH: null
          bib_file_S3_ACCESS_KEY: null
          bib_file_S3_HOST: null
          bib_file_S3_PORT: null
          bib_file_S3_SECRET_KEY: null
          # Attribute of the .bib entries selected for preprocessing.
          bib_file_SELECTED_ATTRIBUTE: Abstract
        # NOTE(review): "aswell" should read "as well"; text left unchanged in
        # case it is compared against generated configuration - confirm.
        description: The bib file, aswell as one attribute selected for preprocessing
        type: file
    outputs:
      normalized_docs_output:
        # Table-output settings; all keys share the `normalized_docs_` prefix.
        config:
          normalized_docs_DB_TABLE: null
          normalized_docs_PG_HOST: null
          normalized_docs_PG_PASS: null
          normalized_docs_PG_PORT: null
          normalized_docs_PG_USER: null
        description: Database Output, containing bib_id aswell as the normalized text
        type: pg_table
  # Entrypoint 2: preprocess a plain .txt file.
  preprocess_txt_file:
    description: Entrypoint to preprocess a .txt file
    # Same settings as preprocess_bib_file, except the download path key is
    # TXT_DOWNLOAD_PATH instead of BIB_DOWNLOAD_PATH.
    envs:
      FILTER_STOPWORDS: true
      LANGUAGE: en
      NGRAM_MAX: 3
      NGRAM_MIN: 2
      TXT_DOWNLOAD_PATH: /tmp/input.txt
      UNIGRAM_NORMALIZER: lemma
      USE_NGRAMS: true
    inputs:
      txt_input:
        # File-input settings; all keys share the `txt_file_` prefix.
        config:
          txt_file_BUCKET_NAME: null
          txt_file_FILE_EXT: txt
          txt_file_FILE_NAME: null
          txt_file_FILE_PATH: null
          txt_file_S3_ACCESS_KEY: null
          txt_file_S3_HOST: null
          txt_file_S3_PORT: null
          txt_file_S3_SECRET_KEY: null
        description: A .txt file, each line will be treated as a document
        type: file
    outputs:
      normalized_docs_output:
        # Table-output settings; identical keys to the bib entrypoint's output.
        config:
          normalized_docs_DB_TABLE: null
          normalized_docs_PG_HOST: null
          normalized_docs_PG_PASS: null
          normalized_docs_PG_PORT: null
          normalized_docs_PG_USER: null
        # NOTE(review): this text mentions bib_id although the entrypoint takes
        # a .txt input - possibly copied from preprocess_bib_file; confirm.
        description: Database Output, containing bib_id aswell as the normalized text
        type: pg_table
name: Language-Preprocessing