-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpodcast_statistics.py
47 lines (42 loc) · 1.57 KB
/
podcast_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pandas as pd
from tqdm import tqdm
import math
## Function to convert duration to seconds
def uniform_duration(duration):
    """Convert an episode duration written in one of several notations to seconds.

    Accepted inputs:
      * a number (``int``, ``float``, or any other non-string ``int()``
        accepts), taken as a count of seconds and truncated toward zero;
      * a plain integer string such as ``"90"``;
      * clock notation ``"MM:SS"`` or ``"HH:MM:SS"``;
      * minute/second mark notation such as ``5'30"``, ``5'`` or ``30"``.

    Args:
        duration: The raw duration value.

    Returns:
        The duration in whole seconds as an ``int``. When the value cannot
        be interpreted (NaN, malformed text, more than three ``:`` fields,
        ...) the error is printed and ``0`` is returned, so a single bad
        value never aborts a batch conversion.
    """
    try:
        ## Anything that is not text is already a number of seconds. Testing
        ## for "not a str" (rather than "is a float") keeps integer inputs
        ## from failing on the substring checks below.
        if not isinstance(duration, str):
            return int(duration)

        text = duration.strip()

        if ":" in text:
            fields = text.split(":")
            if len(fields) > 3:
                ## Report the value instead of silently returning None.
                raise ValueError(
                    "unrecognized duration format: {!r}".format(duration)
                )
            ## Fold left to right in base 60: works for both MM:SS and HH:MM:SS.
            total = 0
            for field in fields:
                total = total * 60 + int(field)
            return total

        if "'" in text or '"' in text:
            minutes_text, minute_mark, seconds_text = text.partition("'")
            if not minute_mark:
                ## No minute mark at all (e.g. 30"): the whole value is seconds.
                minutes_text, seconds_text = "0", text
            ## Drop the seconds mark only when it is really there, so a value
            ## like 5'30 does not lose its last digit.
            if seconds_text.endswith('"'):
                seconds_text = seconds_text[:-1]
            seconds = int(seconds_text) if seconds_text else 0
            return int(minutes_text) * 60 + seconds

        return int(text)
    except (TypeError, ValueError, OverflowError) as e:
        ## Best effort: report the problem and count the value as zero seconds.
        print(e)
        return 0
if __name__ == "__main__":
    ## Read data
    df = pd.read_csv('data/news_episodes.csv')
    print(df.columns)

    ## Convert duration to seconds by using uniform_duration function.
    ## tqdm wraps the column directly so a progress bar is shown while converting.
    df["uniform_duration"] = [
        uniform_duration(raw) for raw in tqdm(df["episode_duration"])
    ]

    ## Sum uniform duration group by podcast_name
    print("grouping...")
    for i, (name, group) in enumerate(df.groupby("podcast_name")):
        ## Seconds are divided by 3600 to report each podcast's total in hours.
        print(i, " - ", name, " - ", group["uniform_duration"].sum()/3600)