# cha_pipeline.cfg
# Pipeline configuration: [cha_workflow] holds one True/False switch per stage,
# and the *_args sections hold the paths and options for individual stages.
# Workflow switches: one True/False flag per pipeline stage name.
# NOTE(review): presumably each flag enables the stage of the same name and the
# matching <stage>_args section supplies its settings - confirm against the
# code that reads this file.
[cha_workflow]
# XBRL stages (only xbrl_parser is set to True here).
xbrl_web_scraper = False
xbrl_validator = False
xbrl_unpacker = False
xbrl_parser = True
xbrl_file_appender = False
xbrl_melt_to_pivot = False
xbrl_subsets = False
# PDF, image and classifier stages (all set to False here).
pdf_web_scraper = False
pdfs_to_images = False
train_classifier_model = False
binary_classifier = False
binary_classifier_accuracy = False
# NOTE(review): the three stages below are set to True, but their *_args
# section headers are commented out and empty in this file - confirm they
# need no settings.
ocr_functions = True
nlp_functions = True
merge_xbrl_to_pdf_data = True
# Settings for the xbrl_web_scraper stage.
[xbrl_web_scraper_args]
# Same path as scraped_dir in [xbrl_validator_args].
scraped_dir = /shares/data/20200519_companies_house_accounts/xbrl_scraped_data
# Relative path.
# NOTE(review): presumably resolved against the pipeline's working directory - confirm.
xbrl_scraper = src/xbrl_scraper
# Comma-separated list, no spaces after the commas.
# NOTE(review): each entry includes a page path after the host name
# (download.companieshouse.gov.uk). If this value feeds a Scrapy spider's
# allowed_domains, only bare domain names are expected there - confirm how
# the scraper uses it.
allowed_domains = download.companieshouse.gov.uk/en_monthlyaccountsdata.html,download.companieshouse.gov.uk/historicmonthlyaccountsdata.html
# Comma-separated list of URLs; the two entries correspond to the two
# allowed_domains entries above with an http:// prefix.
start_urls = http://download.companieshouse.gov.uk/en_monthlyaccountsdata.html,http://download.companieshouse.gov.uk/historicmonthlyaccountsdata.html
# Settings for the xbrl_validator stage.
[xbrl_validator_args]
# Same path as scraped_dir in [xbrl_web_scraper_args]; keep the two in sync.
scraped_dir = /shares/data/20200519_companies_house_accounts/xbrl_scraped_data
# Settings for the xbrl_unpacker stage.
# NOTE(review): both paths below sit directly under /shares/, whereas the
# scraper/validator scraped_dir and the parser's xbrl_parser_data_dir sit under
# /shares/data/20200519_companies_house_accounts/ - confirm which locations
# are intended.
[xbrl_unpacker_args]
xbrl_unpacker_file_source_dir = /shares/xbrl_scraped_data/
xbrl_unpacker_file_destination_dir = /shares/xbrl_unpacked_data
# Settings for the xbrl_parser stage.
[xbrl_parser_args]
# Disabled alternative values (shared-drive locations), kept for reference:
#xbrl_parser_data_dir = /shares/xbrl_unpacked_data
#xbrl_parser_processed_csv_dir = /shares/xbrl_parsed_data
#xbrl_parser_tag_frequencies = /shares/logs
#xbrl_parser_tag_list = /shares/logs
# Active values.
xbrl_parser_data_dir = /shares/data/20200519_companies_house_accounts/xbrl_unpacked_data
# NOTE(review): the three paths below point into one user's home directory
# (/home/peterd/test) and look like local test settings - confirm before
# running this configuration elsewhere.
xbrl_parser_processed_csv_dir = /home/peterd/test
xbrl_parser_tag_frequencies = /home/peterd/test
xbrl_parser_tag_list = /home/peterd/test
xbrl_parser_process_year = 2011
# NOTE(review): an INI parser returns the literal string "None" for the two
# keys below; presumably the reading code treats that as "not set" - confirm.
xbrl_parser_process_quarter = None
xbrl_parser_custom_input = None
# Settings for the xbrl_file_appender stage.
[xbrl_file_appender_args]
# NOTE(review): this input directory differs from the active
# xbrl_parser_processed_csv_dir in [xbrl_parser_args] (/home/peterd/test) -
# confirm the appender is meant to read from here.
xbrl_file_appender_indir = /shares/data/20200519_companies_house_accounts/xbrl_parsed_data/
xbrl_file_appender_outdir = /shares/data/20200519_companies_house_accounts/xbrl_appender_data/
# Same year as xbrl_parser_process_year in [xbrl_parser_args].
xbrl_file_appender_year = 2011
xbrl_file_appender_quarter = 1
#[xbrl_melt_to_pivot_args]
#[xbrl_subsets_args]
# Settings for the pdf_web_scraper stage.
[pdf_web_scraper_args]
filed_accounts_scraped_dir = /shares/data/pdf_download/pdfs_test2
# Relative path.
# NOTE(review): presumably resolved against the pipeline's working directory - confirm.
filed_accounts_scraper = src/filing_fetcher_scraper/filing_fetcher_scraper
#[pdfs_to_images_args]
#[train_classifier_model_args]
#[binary_classifier_args]
#[binary_classifier_accuracy_args]
#[ocr_functions_args]
#[nlp_functions_args]
#[merge_xbrl_to_pdf_data_args]