-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_spotify_dataset.py
94 lines (84 loc) · 5.16 KB
/
create_spotify_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import time

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth
# Spotify application credentials used for the OAuth flow.
# SECURITY: secrets should not be committed to source control. The values are
# taken from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment
# variables when they are set; the literals remain only as a fallback so that
# existing runs keep working.
# TODO(review): rotate these credentials and remove the hard-coded fallbacks.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID", '512eb40fa7af4c8db0864090d056ccfb')
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET", '5afbc098885444848fcbcdf5efa94b3e')
# Scopes requested for the calls made below: saved tracks, recently played
# tracks and followed artists.
SCOPE = "user-library-read user-read-recently-played user-follow-read"
# Redirect URI handed to SpotifyOAuth.
REDIRECT_URI = "http://localhost/"
def get_filtered_features(features_list):
    """
    Pick the audio features stored in the dataset out of an audio-features response.

    features_list: the list returned by ``sp.audio_features(track_id)``; only
        its first element (a dict keyed by feature name) is read.
    returns: a list of nine values in the order acousticness, danceability,
        energy, instrumentalness, liveness, loudness, speechiness, valence,
        tempo. When the response is empty/None or its first entry is None
        (the API may report no features for a track), every value is None so
        the caller can still store the track.
    raises: KeyError if the feature dict lacks one of the expected keys.
    """
    feature_names = (
        "acousticness",
        "danceability",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "valence",
        "tempo",
    )
    track_features = features_list[0] if features_list else None
    if track_features is None:
        # No audio analysis available: keep the row shape, leave values empty.
        return [None] * len(feature_names)
    return [track_features[name] for name in feature_names]
# Interaction flag columns, in the order they close each dataset row.
_INTERACTION_COLUMNS = ["is_on_playlist", "is_saved", "is_recently_played", "is_followed_artist"]


def _record_track(sp, spotify_df, user_id, track, interaction):
    """Add `track` to spotify_df, or set `interaction` on the row it already has."""
    # A playlist item may carry no track object, and some tracks have no id;
    # neither can be looked up, so both are skipped.
    if not track or track["id"] is None:
        return
    track_id = track["id"]
    existing = (spotify_df["user_id"] == user_id) & (spotify_df["track_id"] == track_id)
    if existing.any():
        # Already collected from an earlier source: just record this interaction.
        spotify_df.loc[existing, interaction] = True
        return
    track_name = track["name"]
    artist_name = track["artists"][0]["name"]  # only the first listed artist is stored
    features = get_filtered_features(sp.audio_features(track_id))
    flags = [column == interaction for column in _INTERACTION_COLUMNS]
    spotify_df.loc[len(spotify_df.index)] = [user_id, track_id, track_name, artist_name] + features + flags


def create_spotify_dataset():
    """
    Collect the signed-in user's Spotify preferences and append them to the dataset CSV.

    Run this once for each user, and make sure to log out after each run so
    the next run authenticates as a different user.

    Tracks are gathered from four sources, in this order: the user's playlists
    (first 25 playlists, first page of each), saved tracks (6 pages of 50),
    recently played tracks (up to 50) and the top tracks of followed artists
    (up to 25 artists). Each track is stored once per user; the four
    ``is_*`` columns record which sources it was seen in.

    The rows are appended, without a header line, to
    ``./data/spotify_dataset.csv``. Returns None.
    """
    sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, scope=SCOPE, redirect_uri=REDIRECT_URI))

    # get users' preferences from playlists, saved tracks, recently played tracks, and followed artists -> save info in a dataframe
    spotify_df = pd.DataFrame(
        columns=[
            "user_id", "track_id", "track_name", "artist",
            "acousticness", "danceability", "energy", "instrumentalness", "liveness",
            "loudness", "speechiness", "valence", "tempo",
        ] + _INTERACTION_COLUMNS
    )

    # The display name is what gets stored in the user_id column.
    user_id = sp.current_user()["display_name"]

    # go through the user's playlists
    for playlist in sp.current_user_playlists(limit=25)["items"]:
        for item in sp.playlist_tracks(playlist["id"])["items"]:
            _record_track(sp, spotify_df, user_id, item["track"], "is_on_playlist")

    # go through the user's saved tracks
    for page in range(6):
        for item in sp.current_user_saved_tracks(limit=50, offset=page * 50)["items"]:
            _record_track(sp, spotify_df, user_id, item["track"], "is_saved")

    # go through the user's recently played tracks
    # Cutoff is three months back, in milliseconds (2629800000 ms per month).
    three_months_ago_ms = int(time.time() * 1000) - (3 * 2629800000)
    for item in sp.current_user_recently_played(limit=50, after=three_months_ago_ms)["items"]:
        _record_track(sp, spotify_df, user_id, item["track"], "is_recently_played")

    # go through the user's followed artists
    for followed_artist in sp.current_user_followed_artists(limit=25)["artists"]["items"]:
        # Flag tracks already collected whose stored artist name matches.
        by_this_artist = (spotify_df["user_id"] == user_id) & (spotify_df["artist"] == followed_artist["name"])
        spotify_df.loc[by_this_artist, "is_followed_artist"] = True
        # Then add (or flag) the artist's top tracks.
        for track in sp.artist_top_tracks(followed_artist["id"])["tracks"]:
            _record_track(sp, spotify_df, user_id, track, "is_followed_artist")

    # append the dataset to a csv
    spotify_df.to_csv('./data/spotify_dataset.csv', mode='a', index=False, header=False)
# Start the collection run only when executed as a script, so that importing
# this module does not trigger the OAuth login and the API calls.
if __name__ == "__main__":
    create_spotify_dataset()