buildtrain.py
import requests
import pandas as pd
from datetime import datetime

app = False  # Set to True to append to an existing train file; leave False when starting from scratch
if app: print 'we are appending to an old train file, correct?'
else: print 'we are starting from scratch, correct?'
#### Start by downloading data from nba.com
def load():  ### first thing we do is go get all of the data for the entire year and save it into 2015games.csv
    fout = open('2015games.csv', 'w')
    r = requests.get('http://stats.nba.com/stats/leaguegamelog?Counter=1000000&Direction=DESC&LeagueID=00&PlayerOrTeam=P&Season=2015-16&SeasonType=Regular+Season&Sorter=PTS')
    data = r.json().get('resultSets', [])[0]
    headers = data.get('headers', [])
    rows = data.get('rowSet', [])
    fout.write(','.join(headers))
    fout.write('\n')
    for row in rows:
        fout.write(','.join([str(x) for x in row]))
        fout.write('\n')
    fout.close()  # close so the file is flushed before games_not_processed reads it back
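
# Side note (an assumption, not part of the original script): the stats.nba.com endpoint
# has been known to reject requests that lack browser-like headers. If load() hangs or
# errors out, passing headers to requests may help, e.g.:
#   r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0', 'Referer': 'http://stats.nba.com/'})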

def games_not_processed(app):  ## returns a dataframe of games which have not been added to the processed list. Also returns a set of games which will be added to the processed list once processing is complete.
    try: glist = pd.read_csv('gamesprocessed.csv')
    except: app = False
    if app == False:
        print 'starting from empty gamesprocessed file'
        glist = pd.read_csv('emptygamesprocessed.csv')
    gl1 = set(glist.GAME_ID)
    wf = pd.read_csv('2015games.csv')
    gl2 = set(wf.GAME_ID)
    gdiff = gl2 - gl1
    df = wf[wf['GAME_ID'].isin(gdiff)]  # This selects the subset of unprocessed games
    return df, gdiff

def home_away(df):  #### this determines home and away for each game and lists the opponent. Later (but not yet) we also give the opponent its own "feature" column. It takes the dataframe of unprocessed games and returns a dataframe.
    df['home'] = 1
    df['away'] = 0
    df['Opponent'] = df.MATCHUP
    for z in range(len(df)):
        if len(df.Opponent.iloc[z].split(' vs. ')) == 2:
            df.Opponent.iloc[z] = df.Opponent.iloc[z].split(' vs. ')[1]  # this part is slow for the whole file, for some reason
        else:
            df.Opponent.iloc[z] = df.Opponent.iloc[z].split(' @ ')[1]
            df.away.iloc[z] = 1
            df.home.iloc[z] = 0
    return df
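
# A rough vectorized sketch of the same home/away logic (an alternative, not called by the
# script below): pandas string methods can replace the slow per-row loop. It assumes the
# nba.com MATCHUP format seen above, e.g. 'GSW vs. LAL' for home and 'GSW @ LAL' for away.
def home_away_vectorized(df):
    df = df.copy()
    is_home = df.MATCHUP.str.contains(' vs. ', regex=False)
    df['home'] = is_home.astype(int)
    df['away'] = (~is_home).astype(int)
    df['Opponent'] = df.MATCHUP.str.split().str[-1]  # the opponent is the last token either way
    return df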

def position(df):  #### pulls position from a separately created file that remembers the position and adds dummy features to dataframe
    posdf = pd.read_csv('position.csv')
    df = pd.merge(df, posdf, on='PLAYER_NAME', how='left')  # just something to be aware of: here we merge on name. Other places on ID. This caused me some confusion at one point.
    df = pd.get_dummies(df, columns=['Position'])
    return df
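
# For reference (a toy illustration, not project data): pd.get_dummies replaces the single
# 'Position' column with one indicator column per position value, e.g.
#   pd.get_dummies(pd.DataFrame({'Position': ['PG', 'C']}), columns=['Position'])
# yields Position_C and Position_PG columns.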

def build_train(recent_df):  # recent_df should be the raw data from nba.com for the games not yet processed.
    stats14 = pd.read_csv('2014statsD21.csv')
    stats14 = stats14.drop('PLAYER_NAME', axis=1)  # we will be merging on ID instead, so we don't want two copies
    recent_df['NewGame'] = True
    recent_df['rest'] = 0
    recent_df['recent_minutes'] = -1
    gamelog = recent_df
    gamelog['FanDuel'] = 2*gamelog['FGM'] + gamelog['FG3M'] + gamelog['FTM'] + 1.2*gamelog['REB'] + 1.5*gamelog['AST'] + 2*gamelog['BLK'] + 2*gamelog['STL'] - gamelog['TOV']
    df = pd.merge(gamelog, stats14, on='PLAYER_ID', how='left')
    ### I was having some problems with extraneous index columns creeping into the file when I merge. If there's a different number of these columns, it throws an error when we concatenate. So the point of the next bit of code is to remove these "Unnamed" columns. Then we will merge with the stored train file.
    u_columns = []
    for c in df.columns.values:
        if 'Unnamed' in c:
            u_columns += [c]
    for c in u_columns:
        df = df.drop(c, axis=1)
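    # A more concise equivalent of the loop above (a sketch, same idea):
    #   df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
    # The stray 'Unnamed: 0' columns come from writing CSVs with the index and reading them
    # back; passing index=False to to_csv (or index_col=0 to read_csv) avoids them at the source.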
    if app:
        dfold = pd.read_csv('trainsmall.csv')
        u_columns = []
        for c in dfold.columns.values:
            if 'Unnamed' in c:
                u_columns += [c]
        for c in u_columns:
            dfold = dfold.drop(c, axis=1)
        df = df.append(dfold)
    #df.to_csv('look.csv')
    df = df.sort_index(by=['PLAYER_ID', 'GAME_DATE'], ascending=[True, False])
    ###################################### Create rest days ############################# This part is slow, not sure how best to speed it up.
    for i in range(len(df) - 1):
        if df.NewGame.iloc[i]:
            if df.PLAYER_ID.iloc[i] == df.PLAYER_ID.iloc[i+1]:
                df.recent_minutes.iloc[i] = df.MIN.iloc[i+1]
                try: d1 = datetime.strptime(df.GAME_DATE.iloc[i+1], "%m/%d/%Y")
                except: d1 = datetime.strptime(df.GAME_DATE.iloc[i+1], "%Y-%m-%d")
                try: d2 = datetime.strptime(df.GAME_DATE.iloc[i], "%m/%d/%Y")
                except: d2 = datetime.strptime(df.GAME_DATE.iloc[i], "%Y-%m-%d")
                df.rest.iloc[i] = (d2 - d1).days
            else:
                df.rest.iloc[i] = 15
    df.NewGame = False
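    # A vectorized sketch of the rest / recent_minutes computation above (an alternative,
    # not used here). With the frame sorted by PLAYER_ID and GAME_DATE descending,
    # shift(-1) within each player points at that player's previous game:
    #   dates = pd.to_datetime(df.GAME_DATE)
    #   prev_dates = dates.groupby(df.PLAYER_ID).shift(-1)
    #   df['rest'] = (dates - prev_dates).dt.days.fillna(15)
    #   df['recent_minutes'] = df.groupby('PLAYER_ID')['MIN'].shift(-1).fillna(-1)
    # (Unlike the loop, this recomputes every row rather than only the NewGame rows.)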
    try: df.to_csv('trainsmall.csv')
    except:
        print "close the file, silly"
        df.to_csv('trainsmallbackup.csv')
        print 'not saving the games processed to list'
        quit()
    df = pd.get_dummies(df, columns=['Opponent', 'PLAYER_ID'])
    try: df.to_csv('train.csv')
    except:
        print "close the file, silly"
        df.to_csv('trainbackup.csv')
        print 'not saving the games processed to list'
        quit()

load()
df1, b = games_not_processed(app)
print 'processing games', list(b)
print 'computing home away'
df = home_away(df1)
print 'adding in position file'
df = position(df)
print 'building train'
build_train(df)
print 'writing games processed to csv'
games = pd.read_csv('gamesprocessed.csv')
gameslist = list(games.GAME_ID)
gameslist = gameslist + list(b)
games = pd.DataFrame(gameslist, columns=['GAME_ID'])
games.to_csv('gamesprocessed.csv')
quit()

def build_position_file():  ## this was a one-time shot. It took several FanDuel files to learn the position data, and then created a file with the position data
    p1 = pd.read_csv('p1.csv')
    p2 = pd.read_csv('p2.csv')
    p3 = pd.read_csv('p3.csv')
    p = pd.concat([p1, p2, p3])
    p['PLAYER_NAME'] = p['First Name'] + ' ' + p['Last Name']
    p = p.sort_index(by=['PLAYER_NAME'])
    playerlist = []
    positionlist = []
    for h in range(len(p)):
        if p.PLAYER_NAME.iloc[h] != p.PLAYER_NAME.iloc[h-1]:
            playerlist += [p.PLAYER_NAME.iloc[h]]
            positionlist += [p.Position.iloc[h]]
    plist = pd.DataFrame.from_items([('PLAYER_NAME', playerlist), ('Position', positionlist)])
    plist.to_csv("position.csv")
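
# A shorter sketch of the same dedup-by-player idea (an alternative, not part of the original
# flow), using drop_duplicates in place of the manual comparison loop above:
def build_position_file_v2():
    p = pd.concat([pd.read_csv(f) for f in ('p1.csv', 'p2.csv', 'p3.csv')])
    p['PLAYER_NAME'] = p['First Name'] + ' ' + p['Last Name']
    # keep one row (and hence one Position) per player name
    plist = p.sort_values('PLAYER_NAME').drop_duplicates(subset='PLAYER_NAME')[['PLAYER_NAME', 'Position']]
    plist.to_csv('position.csv', index=False)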