-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexCore18.sh
executable file
·37 lines (30 loc) · 1005 Bytes
/
indexCore18.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env bash
#
# Parse a Core18 collection and build an Indri index from the parsed output.
#
# Usage:
#   indexCore18.sh COLLECTION_PATH INDEX_NAME COLLECTION_FORMAT
#
# Arguments:
#   $1 - collection directory; input files are read from its data/ subdirectory
#   $2 - index location, always placed under the filesystem root ("/$2")
#   $3 - value written into the <class> element of the Indri parameter file
#
# Steps:
#   1. Run /parseCore18.js over COLLECTION_PATH/data/*, writing the parsed
#      output to "COLLECTION_PATH-WRITABLE".
#   2. Download the stopword list into the current directory.
#   3. Write index.param (the indexing parameter file) into the current
#      directory, replacing any previous copy.
#   4. Run IndriBuildIndex with index.param and stoplist.dft as arguments.
#
# Exit status: 2 on bad usage, 1 when a preparation step fails, otherwise the
# exit status of IndriBuildIndex.

set -euo pipefail

readonly STOPLIST_URL='http://www.lemurproject.org/stopwords/stoplist.dft'
readonly STOPLIST_FILE='stoplist.dft'
readonly PARAM_FILE='index.param'

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

if (( $# < 3 )); then
  printf 'Usage: %s COLLECTION_PATH INDEX_NAME COLLECTION_FORMAT\n' \
    "${0##*/}" >&2
  exit 2
fi

# Resolve the collection path to an absolute one up front: the parse step runs
# from "/", so a relative path would otherwise point somewhere else there than
# it does for mkdir and for the parameter file written below.
# CDPATH is cleared so that cd cannot print a directory name into the result.
COLLECTION_PATH=$(CDPATH='' cd -- "$1" && pwd) \
  || die "collection directory not accessible: $1"
INDEX="/$2"
COLLECTION_FORMAT=$3
COLLECTION_PATH_WRITABLE="${COLLECTION_PATH}-WRITABLE"
readonly COLLECTION_PATH INDEX COLLECTION_FORMAT COLLECTION_PATH_WRITABLE

# -p keeps re-runs from failing when the directory already exists.
mkdir -p -- "${COLLECTION_PATH_WRITABLE}" \
  || die "cannot create ${COLLECTION_PATH_WRITABLE}"

# First we need to parse the collection. The subshell confines the directory
# change, so the rest of the script keeps its original working directory.
(
  cd / || exit 1
  # A failed install is only reported: the module may already be present, and
  # if it really is missing the parser run right below fails and aborts.
  npm install striptags \
    || printf 'warning: npm install striptags failed\n' >&2
  # The glob is deliberately left outside the quotes so it expands to the
  # individual input files.
  nodejs /parseCore18.js "${COLLECTION_PATH}"/data/* \
    "${COLLECTION_PATH_WRITABLE}"
) || die "parsing the collection failed"

# Retrieve the stopword list (stored in the current directory). -O pins the
# output name; without it wget saves a re-download as "stoplist.dft.1" and the
# stale copy would be the one handed to the indexer.
wget -O "${STOPLIST_FILE}" "${STOPLIST_URL}" \
  || die "could not download ${STOPLIST_URL}"
echo "Retrieved stopword list"

# Create the parameter file for indexing. It is written in one go with '>' so
# a re-run replaces the file instead of appending a second <parameters> block.
cat > "${PARAM_FILE}" <<EOF || die "cannot write ${PARAM_FILE}"
<parameters>
<index>${INDEX}</index>
<stemmer><name>krovetz</name></stemmer>
<corpus>
<path>${COLLECTION_PATH_WRITABLE}</path>
<class>${COLLECTION_FORMAT}</class>
</corpus>
</parameters>
EOF

# Printout (cat rather than a pager, so the script never waits for a keypress).
cat -- "${PARAM_FILE}"

# Start indexing.
echo "Core18 ... Indexing"
/work/Indri/bin/IndriBuildIndex "${PARAM_FILE}" "${STOPLIST_FILE}"