#!/bin/bash
# Exit on any error (set -e) and let the ERR trap below also fire inside functions and subshells (set -E) for easier debugging
set -eE
trap 'echo >&2 "Error - exited with status $? at line $LINENO"' ERR
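# Example of what the trap prints on a failure (status and line number are illustrative):
#   Error - exited with status 1 at line 57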

# Step 0: Parse any arguments we care about
DATABASE_DIR="."
OTHER_ARGS=()
help() {
    echo "usage: $0 [-h] [-d DATABASE_DIR]"
    echo
    echo "Offline Wikipedia Text API"
    echo
    echo "options:"
    echo "  -h, --help            show this help message and exit"
    echo "  -d DATABASE_DIR, --database_dir DATABASE_DIR"
    echo "                        Base directory containing the wiki-dataset and txtai-wikipedia"
    echo "                        folders."
}
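
# Example invocation (the path below is illustrative):
#   ./run_linux.sh --database_dir /mnt/wikipedia-data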
while [[ $# -gt 0 ]]; do
    case $1 in
        --database_dir|-d)
            DATABASE_DIR="$2"
            shift 2
            ;;
        --help|-h)
            help
            exit 0
            ;;
        *)
            # For any unrecognized args, store them to pass through
            OTHER_ARGS+=("$1")
            shift
            ;;
    esac
done

# Derive the dataset locations from the (possibly user-supplied) database directory
WIKI_DATA_SET_DIR="$DATABASE_DIR/wiki-dataset"
TXTAI_WIKIPEDIA_DIR="$DATABASE_DIR/txtai-wikipedia"

# Step A: Create and activate a Python virtual environment
if [ ! -d "venv" ]; then
    echo "Creating virtual environment"
    python -m venv venv
else
    echo "Existing venv detected."
fi
echo "Activating virtual environment"
source venv/bin/activate
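# From here on, "python" and "pip" resolve to the virtual environment's copies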

# Step B: Install requirements from requirements.txt
echo "---------------------------------------------------------------"
echo "Installing Python requirements from requirements.txt"
pip install -r requirements.txt
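
# Note: the Hugging Face repositories cloned below track their large files with Git LFS.
# Make sure git-lfs is installed and initialized (e.g. "sudo apt install git-lfs && git lfs install"),
# otherwise the clones will contain small pointer files instead of the actual data.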

# Step C: Clone the git repository for full wiki articles into a directory called "wiki-dataset"
echo "---------------------------------------------------------------"
echo "Downloading Wikipedia dataset. As of 2024-11-14, this is about 44GB."
if [ ! -d "$WIKI_DATA_SET_DIR" ]; then
    git clone https://huggingface.co/datasets/NeuML/wikipedia-20240901 "$WIKI_DATA_SET_DIR"
else
    echo "Existing wiki-dataset directory detected."
fi

# Step D: Clone the git repository for txtai wiki summaries into a directory called "txtai-wikipedia"
echo "---------------------------------------------------------------"
echo "Downloading txtai-wikipedia dataset. As of 2024-11-14, this is about 15GB."
if [ ! -d "$TXTAI_WIKIPEDIA_DIR" ]; then
    git clone https://huggingface.co/NeuML/txtai-wikipedia "$TXTAI_WIKIPEDIA_DIR"
else
    echo "Existing txtai-wikipedia directory detected."
fi

# Finally: Start the API
echo "---------------------------------------------------------------"
echo "Starting API. If this is the first run, setup may take 10-15 minutes depending on your machine."
echo "Setup time is due to indexing Wikipedia article titles into a JSON file for API speed."
echo "---------------------------------------------------------------"
echo "API starting..."
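# Forward the chosen database directory plus any pass-through args collected above to start_api.py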
python start_api.py --database_dir "$DATABASE_DIR" "${OTHER_ARGS[@]}"