-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcitation.py
126 lines (105 loc) · 4.68 KB
/
citation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#@TODO: How to handle exceptions for when regular expressions fail to find matches...
#@TODO: create try/catch blocks for regular expression failures
import re
# Text type on both Python 2 (``unicode``) and Python 3 (``str``); stands in
# for the ``unicode`` builtin, which does not exist on Python 3.
_text_type = type(u'')


# class representing a citation
class Citation(dict):
    """Dictionary-backed record for one parsed bibliographic citation.

    Fields are stored as dictionary items and are also readable and
    writable as attributes (``citation.title`` is ``citation['title']``).
    ``Citation.parse`` fills in the keys ``raw``, ``author``, ``issued``,
    ``title``, ``container_title``, ``UID`` and ``references``.

    The parser expects text shaped like::

        Family, I. I., & Family, I. (2010). Title. Journal, 12(3), 45-67.
    """

    rawData = None
    # testing purposes: when true, the parse steps print what they extracted.
    # Toggle it on the class (``Citation.debug = True``); assigning through an
    # instance stores a dictionary item instead, because of ``__setattr__``.
    debug = False

    # regex script for breaking apart fields: authors, "(year)", the rest.
    # ``authors`` is non-greedy so the split happens at the FIRST
    # parenthesised integer; a greedy match would latch onto a later group
    # such as the issue number in "12(3)".
    rex_initial = re.compile(
        r'(?P<authors>.*?)\((?P<date>\s*\d+\s*)\)(?P<postData>.*)',
        re.DOTALL)

    # 'Familyname, I1. I2.' or 'Familyname, I1.'
    _rex_author_list = re.compile(
        r'[A-Z][a-z]+,\s[A-Z]\.\s+[A-Z]\.|[A-Z][a-z]+,\s[A-Z]\.', re.DOTALL)
    # separates an author's family name from the given initial(s)
    _rex_author = re.compile(
        r'(?P<family>[A-Z][a-z]*),\s+(?P<givenList>.+\.)', re.DOTALL)
    # a single given-name initial
    _rex_given = re.compile(r'([A-Z])\.', re.DOTALL)
    # '. <title ending in . ? or !> <journal>, <anything>,'
    _rex_title_and_journal = re.compile(
        r'\.\s*(?P<title>.*[\.\?!])\s*(?P<journal>.*),(.*),')

    # constructor
    def __init__(self, **kwargs):
        """Create a citation whose fields are the given keyword arguments."""
        super(Citation, self).__init__(**kwargs)

    def __delitem__(self, key):
        """Delete ``key`` if present; a missing key is silently ignored."""
        if key in self:
            super(Citation, self).__delitem__(key)

    def __getattr__(self, name):
        """Expose dictionary items as attributes.

        Only called when normal attribute lookup fails. Raises
        ``AttributeError`` (not ``KeyError``) for an unknown name so that
        ``hasattr`` and ``getattr(obj, name, default)`` behave normally.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    # attribute assignment stores a dictionary item
    __setattr__ = dict.__setitem__

    # returns citation obj - subclass of dictionary
    # parse and separate fields from raw data
    @classmethod
    def parse(cls, raw):
        """Build a ``Citation`` from the raw citation string ``raw``.

        Returns the populated citation. Raises ``ValueError`` when ``raw``
        has no parenthesised year or when the text after the year does not
        have the expected title/journal layout.
        """
        # create an empty Citation
        self = cls()
        self['raw'] = raw

        # separate fields
        matches = cls.rex_initial.match(raw)
        if matches is None:
            raise ValueError(
                'citation has no parenthesised year: %r' % (raw,))

        self.parse_authors(matches.group('authors'))
        self.parse_date(matches.group('date'))
        self.parse_titleAndJournal(matches.group('postData'))

        # create unique id for citation
        self.setUID()
        # create the (initially empty) references container
        self.setReferences()
        return self

    def parse_authors(self, authors):
        """Store the authors found in ``authors`` under the ``author`` key.

        The value is a list of ``{'given': ..., 'family': ...}`` dicts, where
        ``given`` holds the initials separated by single spaces. Text that
        does not look like 'Familyname, I.' is skipped, so the list may be
        empty.
        """
        authorList = []
        for author in self._rex_author_list.findall(authors):
            # every findall hit is also matched by _rex_author
            uniqueAuthor = self._rex_author.match(author)
            initials = self._rex_given.findall(uniqueAuthor.group('givenList'))
            authorList.append({
                u'given': ' '.join(initials),
                u'family': uniqueAuthor.group('family'),
            })
        self[u'author'] = authorList

        if self.debug:
            for author in authorList:
                print('Author: ')
                print(author)

    def parse_date(self, date):
        """Store ``date`` as an integer under the ``issued`` key.

        Raises ``ValueError`` (from ``int``) if ``date`` is not an integer.
        """
        # store issued
        self[u'issued'] = int(date)
        if self.debug:
            print('Date: ' + date)

    def parse_titleAndJournal(self, post):
        """Store the ``title`` and ``container_title`` found in ``post``.

        ``post`` is the text following the parenthesised year. Raises
        ``ValueError`` when it does not match the expected layout.
        """
        titleAndJournal = self._rex_title_and_journal.match(post)
        if titleAndJournal is None:
            raise ValueError(
                'cannot find title and journal in: %r' % (post,))

        # grab, clean, and store title
        title = titleAndJournal.group('title').lstrip()
        self[u'title'] = _text_type(title)

        # grab, clean, and store journal
        containerTitle = titleAndJournal.group('journal').lstrip()
        self[u'container_title'] = _text_type(containerTitle)

        if self.debug:
            print('Journal: ' + containerTitle)
            print('Title: ' + title)

    def setUID(self):
        """Store ``<issued>_<title>_<container_title>`` under ``UID``.

        Requires ``issued``, ``title`` and ``container_title`` to be set.
        """
        ID = str(self.issued) + "_" + self.title + "_" + self.container_title
        self[u'UID'] = _text_type(ID)
        # testing
        if self.debug:
            print(self.UID)

    def setReferences(self):
        """Store a new empty dict under the ``references`` key."""
        self[u'references'] = {}