-
-
Notifications
You must be signed in to change notification settings - Fork 54
/
Copy pathcheck_videos.py
91 lines (75 loc) · 2.83 KB
/
check_videos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import csv
import os
import re
import sys
# TBD fetch all the videos in the youtube channel
# For now we can download files via https://takeout.google.com/
# Apparently not all the playlists are included in the first csv file in the download
# list the videos that are in none of the playlists
# check if all the videos on youtube have a page in code-maven.
# All the videos from the Hebrew YouTube channel should be published on the code-maven site in the sites/he/pages folder.
# The videos from the English YouTube channel are divided between the code-maven.com site and the perlmaven.com site
# First-column values of the csv export that get_videos_from_youtube() skips.
exclude = [
    'Video Id',     # title row of the csv file, not a real video id
    'muliGK6lz0U',  # short
    'foR9n0ws1KQ',  # short
    'qmmPz6ozNXY',  # welcome video
]
def get_videos_from_code_maven(language):
    """Collect the YouTube ids referenced by the pages of one site language.

    Scans every ``.txt`` file in ``sites/<language>/pages`` (relative to the
    current working directory) for
    ``<screencast file="..." youtube="..." />`` tags.

    Args:
        language: Name of the sub-directory of ``sites`` to scan, e.g. ``'he'``.

    Returns:
        A list holding the ``youtube`` attribute of every tag found. Files are
        visited in sorted name order; an id that is referenced more than once
        appears more than once.

    Raises:
        OSError: If the pages directory or one of its files cannot be read.
    """
    pages_dir = os.path.join('sites', language, 'pages')
    # Compiled once up front instead of being looked up again for every row.
    screencast = re.compile(r'<screencast file="([^"]+)" youtube="([^"]+)" />')
    entries = []
    # sorted() makes the order of the result independent of the file system.
    for file in sorted(os.listdir(pages_dir)):
        if not file.endswith('.txt'):
            continue
        # Do not rely on the locale's default encoding: the pages may contain
        # non-ASCII text. NOTE(review): assumes the pages are stored as UTF-8.
        with open(os.path.join(pages_dir, file), encoding='utf-8') as fh:
            for row in fh:
                # finditer() so a second tag on the same row is not dropped.
                entries.extend(
                    match.group(2) for match in screencast.finditer(row)
                )
    return entries
def get_videos_from_youtube():
    """Read the video ids listed in ``Video Metadata.csv``.

    The file is looked up in the current working directory. The stripped first
    cell of every non-empty row is taken as a video id; values found in the
    module-level ``exclude`` list (which contains the title-row cell) are
    skipped.

    Returns:
        A list of video ids in file order.

    Raises:
        OSError: If ``Video Metadata.csv`` cannot be opened.
    """
    ids = []
    # newline='' is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed as one record. 'utf-8-sig' also swallows
    # an optional BOM, which would otherwise stick to the first cell and stop
    # the title row from matching ``exclude``.
    # NOTE(review): assumes the export is UTF-8 encoded - confirm.
    with open('Video Metadata.csv', newline='', encoding='utf-8-sig') as fh:
        for row in csv.reader(fh):
            if not row:
                continue
            vid = row[0].strip()
            if vid in exclude:
                continue
            ids.append(vid)
    return ids
def read_playlists():
    """Load every playlist file found in the ``playlists`` directory.

    The directory is looked up in the current working directory. In each file
    everything up to and including the row that starts with
    ``Video Id,Time Added`` is ignored; from every following non-blank row the
    text before the first comma is taken as a video id.

    Returns:
        A ``(playlists, videos)`` tuple. ``playlists`` maps the file name
        without its extension to the list of video ids in file order (an empty
        list when the file has no data rows); ``videos`` is the set of all ids
        seen in any playlist.

    Raises:
        OSError: If the directory or one of its files cannot be read.
    """
    playlists = {}
    videos = set()
    for playlist_file in os.listdir('playlists'):
        # splitext() instead of slicing off a fixed four characters, so the
        # name is right whatever the length of the extension is.
        name = os.path.splitext(playlist_file)[0]
        playlists[name] = []
        # NOTE(review): assumes the playlist files are UTF-8 encoded - confirm.
        with open(os.path.join('playlists', playlist_file), encoding='utf-8') as fh:
            in_data = False
            for row in fh:
                if row.startswith("Video Id,Time Added"):
                    in_data = True
                    continue
                if not in_data:
                    continue
                # Only the first column is needed. Taking element 0 of the
                # split works for any number of commas in the row, where the
                # former two-name unpacking raised ValueError for rows that
                # had no comma or more than one.
                vid = row.split(",", 1)[0].strip()
                if not vid:
                    continue  # blank row or empty id cell
                playlists[name].append(vid)
                videos.add(vid)
    return playlists, videos
def main():
    """Print the differences between the 'he' site, the YouTube export and the playlists.

    Three sorted lists are printed: ids found on the site but not in the
    YouTube export, ids in the export but not on the site, and ids in the
    export that are in none of the playlists.
    """
    site_ids = set(get_videos_from_code_maven('he'))
    channel_ids = set(get_videos_from_youtube())
    _playlists, playlist_ids = read_playlists()

    # Ids referenced by the site that are missing from the YouTube export.
    # NOTE: it is strange that this list is not empty. The first id that was
    # checked by hand did exist on YouTube, so the cause is still unclear.
    only_on_site = site_ids - channel_ids
    print(sorted(only_on_site))

    print("Videos on YouTube but not on the site")
    only_on_youtube = channel_ids - site_ids
    print(sorted(only_on_youtube))

    print("Videos not in any playlist")
    without_playlist = channel_ids - playlist_ids
    print(sorted(without_playlist))
# Run the report only when the file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()