Updates scraping in construction of DSing
groadabike committed Oct 17, 2019
1 parent 07214c2 commit bc08f22
Showing 3 changed files with 111 additions and 77 deletions.
7 changes: 2 additions & 5 deletions DSing Construction/runme_sing2dsing.sh
@@ -12,7 +12,7 @@ echo "DSing30 = The English spoken recordings from GB plus all the other English

version=

DSing_dest=/media/gerardo/SoloSinging/${version}
DSing_dest=/media/gerardo/SoloSinging/DSing_Task/${version}
SmuleSing_path=/media/gerardo/SoloSinging/DAMP/sing_300x30x2

# A- Prepare the workspace
@@ -25,10 +25,7 @@ python copy_lyrics.py $DSing_dest $SmuleSing_path
python identify_wordlevel_lyrics.py $DSing_dest

# D- Download sentence-level prompt-lyrics from Smule
#
# I need to change this step.
# Smule changed the divs, which blocks scraping
# python scraping_lyrics.py $workspace $db_path
python scraping_lyrics.py $DSing_dest

# E- Transform word to sentence level
python word_to_sentence_level.py $DSing_dest
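
For reference, the scripted steps above can also be driven from Python. A minimal sketch, assuming the script names and arguments shown in this file; the paths and version are placeholders (${version} is left unset above):

import subprocess

# Placeholder paths; the real ones come from the variables at the top of this script.
DSING_DEST = "/media/gerardo/SoloSinging/DSing_Task/v1"
SMULESING_PATH = "/media/gerardo/SoloSinging/DAMP/sing_300x30x2"

steps = [
    ["python", "copy_lyrics.py", DSING_DEST, SMULESING_PATH],  # copy lyrics into the workspace
    ["python", "identify_wordlevel_lyrics.py", DSING_DEST],    # mark word-level prompts
    ["python", "scraping_lyrics.py", DSING_DEST],              # D: download sentence-level lyrics
    ["python", "word_to_sentence_level.py", DSING_DEST],       # E: word to sentence level
]
for cmd in steps:
    subprocess.run(cmd, check=True)  # stop at the first step that fails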
138 changes: 79 additions & 59 deletions DSing Construction/scraping_lyrics.py
@@ -4,6 +4,7 @@
from os import listdir, makedirs
import argparse
from user_agent import generate_user_agent
import json


def get_countries(workspace_path):
@@ -29,70 +30,81 @@ def download_lyrics(args):
for country in countries:
print("[English Subset] Recovering from country {}".format(country))
word_level_list = file2list(join(workspace, "data", country, "word_level.txt"))
path_downloaded_lyrics = join(workspace, "DownloadLyric")
create_folder(path_downloaded_lyrics)
recovered_lyrics = [f for f in listdir(path_downloaded_lyrics) if f.endswith('.txt')]

for word_level in word_level_list:
headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
# Arrangement ID
arrangement = word_level.split('.')[0]
# Metadata file path
metadata_path = join(workspace, 'data', country, country + "ArrangementMeta", arrangement + ".txt")

# Read metadata file for current word_level prompt
try:
metadata = dict(map(str, x.split(':', 1)) for x in file2list(metadata_path))
except ValueError:
# Metadata file has format errors
# create empty dict
if arrangement not in metadata_with_errors:
metadata_with_errors.append(arrangement)
metadata = {}

# Catch error if title is not in Metadata
try:
title = format_text(metadata['Arrangement title'].lstrip())
except KeyError:
if arrangement not in metadata_with_errors:
metadata_with_errors.append(arrangement)
title = ""

# Catch error if artist is not in Metadata
try:
artist = format_text(metadata['Arrangement artist'].lstrip())
except KeyError:
if arrangement not in metadata_with_errors:
metadata_with_errors.append(arrangement)
artist = ""

url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement'.\
format(artist, title, arrangement)

# try to get the lyrics several times in case of network connection errors
attempts = 50
while attempts > 0:
response = requests.get(url, timeout=5, headers=headers)
html = response.content
soup = BeautifulSoup(html, "html5lib")
mydiv = soup.find_all("div", {"class": "main"})

# Only download the lyrics on their first occurrence, not once per country
if arrangement + ".txt" not in recovered_lyrics:
# Metadata file path
metadata_path = join(workspace, 'data', country, country + "ArrangementMeta", arrangement + ".txt")

# Read metadata file for current word_level prompt
try:
metadata = dict(map(str, x.split(':', 1)) for x in file2list(metadata_path))
except ValueError:
# Metadata file has format errors
# create empty dict
if arrangement not in metadata_with_errors:
metadata_with_errors.append(arrangement)
metadata = {}

# Catch error if title is not in Metadata
try:
title = format_text(metadata['Arrangement title'].lstrip())
except KeyError:
if arrangement not in metadata_with_errors:
metadata_with_errors.append(arrangement)
title = ""

# Catch error if artist is not in Metadata
try:
artist = format_text(metadata['Arrangement artist'].lstrip())
except KeyError:
if arrangement not in metadata_with_errors:
metadata_with_errors.append(arrangement)
artist = ""

url = 'https://www.smule.com/song/{}-{}-karaoke-lyrics/{}/arrangement'.\
format(artist, title, arrangement)

# try to get the lyrics several times in case of network connection errors
attempts = 5
while attempts > 0:
response = requests.get(url, timeout=5, headers=headers)
html = response.content
soup = BeautifulSoup(html, "html.parser")
mydiv = soup.find_all("script")#, {"class": "_1frabae"})
if len(mydiv) < 1:
attempts -= 1
else:
attempts = 0

if len(mydiv) < 1:
attempts -= 1
mydiv = soup.find_all("div", {"class": "column error-gone"})
print("[WARNING] can't find {}".format(url))
for div in mydiv:
path_to_error_download = join(workspace, "data", country, "error_download.txt")
with open(path_to_error_download, "a") as error_file:
error_file.write("arrangement: {}\terror: {}\tdetails: {}".format(
arrangement, div.h1.get_text(), div.p.get_text()
))
else:
attempts = 0

if len(mydiv) < 1:
mydiv = soup.find_all("div", {"class": "column error-gone"})
print("[WARNING] can't find {}".format(url))
for div in mydiv:
path_to_error_download = join(workspace, "data", country, "error_download.txt")
with open(path_to_error_download, "a") as error_file:
error_file.write("arrangement: {}\terror: {}\tdetails: {}".format(
arrangement, div.h1.get_text(), div.p.get_text()
))
else:
for div in mydiv:
lyric = div.get_text(strip=True, separator="\n").split("\n")
path_new_lyric = join(workspace, "data", country, country + "DownloadLyric", arrangement + ".txt")
#print("[Recover lyric] url {} - > save in {}".format(url, path_new_lyric))
list2file(path_new_lyric, lyric)
for div in mydiv:
lyric_text = div.get_text()#.replace("\n","")#.split("\n")
if "<p>" in lyric_text:
lyric = lyric_text[lyric_text.find("<p>")+3:lyric_text.find("</p>")-4].split("<br>")
path_new_lyric = join(path_downloaded_lyrics,
arrangement + ".txt")
print("[Recover lyric] url {} - > save in {}".format(url, path_new_lyric))
list2file(path_new_lyric, lyric)
break
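
The rewritten loop above retries the request up to five times and now pulls the lyrics out of a script block (Smule's current markup) rather than the old div layout. A standalone sketch of that fetch-and-parse pattern; fetch_lyrics and the example URL are illustrative, and only the control flow mirrors the code above:

import requests
from bs4 import BeautifulSoup

def fetch_lyrics(url, attempts=5):
    """Return the lyric lines scraped from a Smule arrangement page, or None."""
    for _ in range(attempts):
        try:
            response = requests.get(url, timeout=5)
        except requests.RequestException:
            continue  # transient network error: retry
        soup = BeautifulSoup(response.content, "html.parser")
        for tag in soup.find_all("script"):
            text = tag.get_text()
            if "<p>" in text:
                # the lyrics are embedded in the script body as an HTML fragment
                body = text[text.find("<p>") + 3:text.find("</p>")]
                return [line for line in body.split("<br>") if line]
    return None  # page gone, or the markup changed again

Usage would look like fetch_lyrics("https://www.smule.com/song/artist-title-karaoke-lyrics/123/arrangement") with a hypothetical arrangement id.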



def format_text(text):
@@ -114,11 +126,19 @@ def create_folder(fd):
makedirs(fd)


def clean_text(text):
if "&#39;" in text:
text = text.replace("&#39;", "'")
if "&quot;" in text:
text = text.replace("&quot;", '"')
return text
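
clean_text above covers the two HTML entities that actually appear in the scraped lyrics. For anything beyond &#39; and &quot;, the standard library's html.unescape decodes the full entity table; shown here as an alternative, not what this script uses:

import html

# Handles &#39;, &quot;, and every other named or numeric entity.
assert html.unescape("don&#39;t say &quot;hi&quot;") == 'don\'t say "hi"'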


def list2file(path, data):
create_folder(dirname(path))
with open(path, "w") as file:
for item in data:
file.write("{}\n".format(item))
file.write("{}\n".format(clean_text(item)))


def file2list(filepath):
@@ -143,7 +163,7 @@ def file2list(filepath):
' to recover the lyrics as appears in the Smule website'
)

parser.add_argument("workspace", type=str, help="Path to Workspece")
parser.add_argument("workspace", type=str, help="Path to Workspace")
parser.add_argument('--version', action='version',
version='%(prog)s 1.0')

43 changes: 30 additions & 13 deletions DSing Construction/word_to_sentence_level.py
@@ -53,7 +53,9 @@ def file2list(filepath):
outlist = []
with open(filepath) as file:
for line in file:
outlist.append(line.replace('\n', ''))
line = line.replace('\n', '')
if line:
outlist.append(line)
return outlist


@@ -69,16 +71,18 @@ def create_original_json(args):

for country in countries:
print("[English Subset] Doing word2sentence lyrics of country {}".format(country))
arrangement_list = [f for f in listdir(join(workspace, "data", country, country + "DownloadLyric"))
if f.endswith('.txt')]
arrangement_list = file2list(join(workspace, "data", country, "word_level.txt"))

for arrangement in arrangement_list:
new_text_path = join(workspace, "data", country, country + 'DownloadLyric', arrangement)
original_annotation_path = join(workspace, "data", country, country + 'Lyrics', arrangement.split(".")[0] + ".json")
reconstructed_annotation = reconstruct_original_lyrics(new_text_path, original_annotation_path)
original_reconstructed_lyrics_path = join(workspace, "data", country, country + 'DownloadLyric', arrangement.split(".")[0] + ".json")
create_folder(dirname(original_reconstructed_lyrics_path))
with open(original_reconstructed_lyrics_path, 'w') as outfile:
json.dump(reconstructed_annotation, outfile, indent=4)
new_text_path = join(workspace, 'DownloadLyric', arrangement.split(".")[0] + ".txt")
if exists(new_text_path):
original_annotation_path = join(workspace, "data", country, country + 'Lyrics', arrangement)
reconstructed_annotation = reconstruct_original_lyrics(new_text_path, original_annotation_path)
original_reconstructed_lyrics_path = join(workspace, "data", country, country + 'DownloadLyric',
arrangement.split(".")[0] + ".json")
create_folder(dirname(original_reconstructed_lyrics_path))
with open(original_reconstructed_lyrics_path, 'w') as outfile:
json.dump(reconstructed_annotation, outfile, indent=4)

one_word_recovered = [f for f in listdir(join(workspace, "data", country, country + 'DownloadLyric'))
if f.endswith(".json")]
@@ -94,7 +98,6 @@ def reconstruct_original_lyrics(text_lyrics_path, json_lyrics_path):
text_lyrics = file2list(text_lyrics_path)
it_text_lyrics = iter(text_lyrics)
current_text = next(it_text_lyrics)

reconstruct = []
element = {'t': 0.0,
'l': ""}
@@ -104,12 +107,26 @@
except json.decoder.JSONDecodeError:
print(data_file)
for item in data:
if item['l'] in current_text:
if item['l'] == current_text[:len(item['l'])]:
if element['t'] == 0:
element['t'] = item['t']
element['l'] = current_text
current_text = current_text[len(item['l']):].lstrip()
if len(current_text) == 0:

# if item['l'] == current_text[1:len(item['l'])]:
# if element['t'] == 0:
# element['t'] = item['t']
# element['l'] = current_text
# current_text = current_text[len(item['l'])+1:].lstrip()
#
# if item['l'][1:] == current_text[:len(item['l'])-1]:
# if element['t'] == 0:
# element['t'] = item['t']
# element['l'] = current_text
# current_text = current_text[len(item['l'])-1:].lstrip()


if len(current_text) == 0 or current_text == ',' or current_text == '.':
reconstruct.append(element)
element = {'t': 0.0,
'l': ""}
