-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata_collector.py
100 lines (73 loc) · 2.88 KB
/
data_collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import csv,time, sys, json, os, concurrent.futures
from multiprocessing import Pool, Lock
from pymongo import MongoClient
from utils.RepoDownload import CloneRepo
from refactoring_identifier.RefMiner import RefMiner
from utils.file_folder_remover import Remover
from utils.method_code_extractor import MethodExtractor
from joblib import Parallel, delayed
# Number of worker threads used to fan out repo processing (see __main__)
NUM_WORKERS = 6
# Multiprocessing lock serializing appends to the shared JSONL output file
lock = Lock()
# Function to clone a Git repo, run a Java program, and parse the output
def process_repo(repo_details):
    """Clone one repository, run RefactoringMiner on it, extract method
    bodies, and append the result as a single JSON line to the shared output.

    Args:
        repo_details: a ``(repo_name, repo_url)`` pair.

    Returns:
        None. Each stage is best-effort: on failure the error is printed
        and the repository is skipped.

    Relies on the module-level ``lock`` and on the global
    ``output_file_name`` assigned in the ``__main__`` block.
    """
    print("Start analyzing for repo - " + repo_details[0])
    # Clone the repo
    try:
        cloned_path = CloneRepo(repo_details[0], repo_details[1]).clone_repo()
    except Exception as e:
        print(e)
        return
    # Run RefactoringMiner
    try:
        ref_output_path = RefMiner().exec_refactoring_miner(cloned_path, repo_details[0])
    except Exception as e:
        print(e)
        return
    # Parse the miner's JSON and extract positive/negative method bodies,
    # then delete the clone and the miner output to reclaim disk space.
    try:
        me_obj = MethodExtractor(cloned_path, ref_output_path)
        parsed_json_dict = me_obj.json_parser()
        pos_method_body_list, neg_method_body_list = me_obj.extract_method_body(parsed_json_dict)
        Remover(cloned_path).remove_folder()
        Remover(ref_output_path).remove_file()
    except Exception as e:
        print(f"Error extracting positive and negative methods for {repo_details[0]}")
        print(e)
        return
    # Store the methods
    db_dict = {
        "repo_name": repo_details[0],
        "repo_url": repo_details[1],
        "positive_case_methods": pos_method_body_list,
        "negative_case_methods": neg_method_body_list
    }
    with lock:
        # Append locally as JSONL; the lock serializes concurrent writers.
        out_jsonl_path = os.path.join(os.environ['SLURM_TMPDIR'], 'extract-method-identification', "data", "output")
        # makedirs(exist_ok=True) replaces the check-then-mkdir pattern:
        # no race between workers, and missing parents are created too.
        os.makedirs(out_jsonl_path, exist_ok=True)
        with open(os.path.join(out_jsonl_path, output_file_name), 'a') as f:  # os.environ["output_file_name"] doesn't work in MP
            f.write(json.dumps(db_dict) + "\n")
            # the `with` block flushes and closes the handle on exit;
            # explicit flush()/close() here were redundant.
    print("End analysis for repo - " + repo_details[0])
if __name__ == "__main__":
    print("Start")
    # Usage: data_collector.py <input_csv> <output_file_name>
    # Fail with a clear message instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit("Usage: data_collector.py <input_csv> <output_file_name>")
    print(sys.argv[1])
    input_file = sys.argv[1]
    # NOTE: must remain a module-level global -- process_repo() reads it
    # (passing it through the executor would change process_repo's signature).
    output_file_name = sys.argv[2]
    with open(input_file, "r") as f:
        reader = csv.reader(f)
        # Keep only rows with at least (name, url); blank or short CSV
        # rows previously crashed with IndexError.
        repo_details = [(row[0], row[1]) for row in reader if len(row) >= 2]
    t = time.time()
    # Threads (not processes) suffice: the work is dominated by git clones
    # and the external RefactoringMiner subprocess, which release the GIL.
    with concurrent.futures.ThreadPoolExecutor(NUM_WORKERS) as executor:
        executor.map(process_repo, repo_details)
    print(time.time() - t)