-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlibraryOrganizer.m
241 lines (216 loc) · 10 KB
/
libraryOrganizer.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
function bibStruct = libraryOrganizer
% LIBRARYORGANIZER Adds machine-generated keywords for pdf documents
%   function bibStruct = libraryOrganizer
%
%   Performs topic modelling on a collection of text files specified via
%   user input. Best suited for academic texts which include a DOI, e.g.
%   research articles. Creates a BibTex file with automatically generated
%   keywords for later import to reference managers.
%   Topic modelling is a way to identify common themes across multiple
%   documents based on word frequencies. With the approach used here, an
%   individual document can belong to multiple topics (probabilistic
%   model).
%
%   Topic Modelling Method: Latent Dirichlet Allocation (MATLAB fitlda)
%
%   Workflow:
%   - Choose a folder with text files
%   - Optionally: choose an existing BibTex file
%   - Prepare text and perform topic modelling
%   - Visualize documents and topics
%   - Create output BibTex file
%
%   ====INPUT=====
%   User input via GUI elements
%
%   ====OUTPUT====
%   bibStruct       struct      Contains bibliographic information for
%                               user-defined text files and
%                               automatically generated keywords.
%                               If the user quits in one of the later
%                               dialogs, the struct built up to that
%                               point is returned.
%   author: Joshua Pepe Woller
%

% Default number of topics to be extracted in topic modelling. Also used
% as the pre-filled value of the topic-count dialog.
nTopicDef = 5;

%% READ PDF FILES
% Depth of search, specify level down to which files should be searched.
% NOTE(review): if the dialog is closed, searchDepth is empty and is
% passed on unchanged; confirm how fileSearch treats that case.
searchDepth = questdlg({"Please specify the search depth for retrieving " + ...
    "pdf files."; "Full: Look into folder and all subfolders (and their subfolders...)" + ...
    " (e.g., the fully nested folder structure)"; ...
    "First: Only consider files directly in the folder, "; ...
    "Second: Look into folder and direct subfolders for files " + ...
    "as well as direct subfolders"}, "Search depth setting", ...
    "full", "first", "second", "full");

% Retrieve files from specified folder structure at given search depth
% fileNames: list of file names
% nFiles: number of files found
[~, fileNames, nFiles] = fileSearch("searchDepth", searchDepth);

%% RETRIEVE RAW TEXT AND BIBLIOGRAPHIC INFORMATION
% Optional input of existing BibTex formatted file that describes the pdf
% files in the folder (e.g., generated by a reference manager program).
% Should be in .bib or .txt format.
ownBib = questdlg({"Do you want to choose an existing BibTex file?"; ...
    "If not, information will be retrieved from an online service " + ...
    "(crossref.org)."}, ...
    "Choose BibTex file", "Yes", "No", "No");

% Only use a local BibTex file if the user asked for one AND actually
% picked a file. Closing either dialog falls back to the online lookup,
% which is the default answer of the dialog above.
useOwnBib = false;
if strcmp(ownBib, "Yes")
    [bibFile, bibPath] = uigetfile(["*.bib"; "*.txt"] , ...
        "OPTIONAL: Choose local BibTex file, if available (.bib/.txt).");
    % uigetfile returns numeric 0 when the dialog is cancelled.
    useOwnBib = ~isequal(bibFile, 0);
end

if useOwnBib
    bibFile = fullfile(bibPath, bibFile);
    % Extract text from pdf files, but do not download additional info
    textStruct = getBibInfo(nFiles, fileNames, "downloadBibTex", false);
    % Convert .bib or .txt file to struct
    bibStruct = parseBibTex(bibFile);

    % Fill empty fields with <missing> values to allow for better indexing.
    % Otherwise, empty fields are simply omitted if contents of struct are
    % listed (e.g., via accessing the DOI field with bibStruct.doi)
    fieldNames = fieldnames(bibStruct);
    for fieldIdx = 1:numel(fieldNames)
        currField = fieldNames{fieldIdx};
        % Logical indexing to find empty fields
        emptyIndex = arrayfun(@(entry) isempty(entry.(currField)), bibStruct);
        if any(emptyIndex)
            [bibStruct(emptyIndex).(currField)] = deal(missing);
        end
    end

    % Find BibTex entries that have the same DOI or file location as
    % extracted from pdf files.
    inBibDoi = ismember([bibStruct.doi], [textStruct.doi]);
    % Remove escape character "\" present in MS windows style paths.
    fileLoc = strrep([bibStruct.file], "\\", "\");
    inBibFile = ismember(fileLoc, [textStruct.file]);
    isMatched = inBibDoi | inBibFile;

    % Set aside BibTex info of documents that were not matched to files in
    % the folder structure. Later, this gets appended to the output file.
    % The complement of the SAME mask is used for both subsets so that an
    % entry matched only by its file location is not written twice.
    origBibTex = bibStruct(~isMatched);
    % Keep the entries matched to parsed pdf files by DOI or file location
    bibStruct = bibStruct(isMatched);
else
    % No local BibTex file: extract text from the pdf files and try to
    % download BibTex info from crossref.org using the DOI found in the
    % pdf texts.
    textStruct = getBibInfo(nFiles, fileNames, "downloadBibTex", true);
    bibStruct = parseBibTex(textStruct);
end

%% TEXT PREPARATION AND TOKENIZATION
% With many and large texts, topic modelling can be faster if only nouns
% are used. This however leads to weird n-Grams due to the deletion of
% adjectives and verbs between nouns.
tokOption = questdlg({"Use only nouns for tokenization and topic modelling?"; ...
    "Topic Modelling using only nouns from a tokenized document" + ...
    " can be faster, but n-Grams become less comprehensible."; ...
    "'All words' recommended as default."}, ...
    "Tokenization Options", "All words", "Nouns only", "All words");
% strcmp always yields a logical scalar, also when the dialog was closed
% and tokOption is empty.
nounOnly = strcmp(tokOption, "Nouns only");

% Tokenizing simplifies and unifies word forms, making them suitable for
% algorithmic analysis. Short and infrequent words get deleted.
% For optional arguments, see corresponding documentation.
% textStruct.text contains parsed text of individual pdf files
tokText = preprocessingText([textStruct.text], "NounOnly", nounOnly);

% Bag-of-Words and Bag-of-N-Gram models reduce text to a frequency count
% for subsequent analysis.
[wordBag, nGrams] = wordBagPack(tokText);

% Topic modelling can be performed on individual words (e.g., "Neuron") or
% on n-Grams (e.g., the bi-Gram "Neuron doctrine").
% Ask for the basis of topic modelling until a choice is made or the
% program is quit.
textItemType = '';
while isempty(textItemType)
    % Closing the window leads to empty output
    textItemType = questdlg("Which text units should be used for topic modelling?", ...
        "Topic Modelling Selection", 'words', 'n-Grams', 'words');
    if isempty(textItemType)
        % If no option was chosen, ask again and give opportunity to quit
        quitTopic = questdlg("No method was chosen. Quit program?", ...
            "Exit Topic Modelling", "Go back", "Quit", "Go back");
        if strcmp(quitTopic, "Quit")
            % Show error dialog and exit function execution
            errordlg("User did not specify text unit for topic modelling. " + ...
                "Program terminated.");
            return
        end
    end
end

%% TOPIC MODELLING
% GUI to set number of topics (integer); returns a cell array, which is
% empty if the dialog was cancelled.
topicInput = inputdlg('How many topics should be extracted?', ...
    'Number of Topics', [1 50], {num2str(nTopicDef)});
if isempty(topicInput)
    rawTopicInput = "";
    nTopics = NaN;
else
    rawTopicInput = string(topicInput{1});
    % str2double yields NaN (never empty) for non-numeric text
    nTopics = str2double(rawTopicInput);
end

% The topic count must be a positive, finite integer.
isValidCount = isscalar(nTopics) && isreal(nTopics) && isfinite(nTopics) ...
    && nTopics >= 1 && nTopics == fix(nTopics);
if ~isValidCount
    % If no valid integer was passed, use default value
    warndlg("'" + rawTopicInput + "'" + " is not a valid input " + ...
        "for number of topics. " + ...
        "Using default value of " + nTopicDef + ".")
    nTopics = nTopicDef;
end

if nTopics >= wordBag.NumDocuments/2
    warning("Less than two documents per topic on average." + ...
        " Consider reducing the number of topics or increasing" + ...
        " the number of documents.")
end

% Perform topic modelling based on user specified text units and number of
% topics.
switch textItemType
    case "words"
        topicModel = fitlda(wordBag, nTopics, "Verbose", 0);
    case "n-Grams"
        % Warn user that nounOnly and n-Grams don't get along
        if nounOnly
            warndlg("Extracting only nouns not recommended for N-gram based models!")
        end
        topicModel = fitlda(nGrams, nTopics, "Verbose", 0);
end

%% INSPECT AND RENAME TOPICS
% Show word clouds describing all topics; user can define names for the
% topics that later serve as keywords for the BibTex entries.
% TSNE plot illustrates distance between individual topic clusters.
[~, topicLabels] = plotTopic(topicModel, bibStruct);

%% BIBTEX FILE EXPORT
% Add generated keywords (frequent words, nGrams and associated topics) to
% bibStruct.
bibStruct = bibUpdate(bibStruct, wordBag, nGrams, topicLabels, fileNames);

% Select file to write BibTex to. Ask until a file is chosen or the user
% quits.
filePath = '';
while isempty(filePath)
    [writeFile, writePath] = uiputfile('*.bib', "Choose Location to save " + ...
        "BibTex file", "keywordedLibrary");
    if isequal(writeFile, 0)
        % uiputfile returns numeric 0 on cancel; the path must not be
        % built in that case because fullfile only accepts text inputs.
        % Ask again and offer the opportunity to quit.
        quitTopic = questdlg("No file for saving BibTex specified. Quit program?", ...
            "Exit?", "Go back", "Quit", "Go back");
        if strcmp(quitTopic, "Quit")
            errordlg("User did not specify BibTex file location. " + ...
                "Program terminated.");
            return
        end
    else
        filePath = fullfile(writePath, writeFile);
    end
end

% Write keyworded bibStruct to file
writeBibTex(bibStruct, filePath)

% If we received a user-defined BibTex file, we append unmatched entries
% (e.g. where no corresponding files in our folder were found)
% to the new BibTex file so that the initial library is complete again.
if exist("origBibTex", "var")
    writeBibTex(origBibTex, filePath, "mode", 'a+')
end

% Notify user of success; OK to end function
uiwait(msgbox("Done! BibTex successfully written to: "+filePath+" .", ...
    "Success", "modal"))
end