From 39660d5d5fe1c94c8eba97c57443935703e844b8 Mon Sep 17 00:00:00 2001
From: mir-am
Date: Wed, 30 Jun 2021 18:16:17 +0200
Subject: [PATCH 01/31] Removing type annotations that do not type check by mypy [WIP] - Part 1

---
 libsa4py/cst_pipeline.py | 166 +++++++++++++++++-
 libsa4py/cst_transformers.py | 41 ++++-
 libsa4py/type_check.py | 7 +-
 libsa4py/utils.py | 22 +++
 tests/examples/type_apply_ex.json | 6 +-
 tests/examples/type_apply_typed_ex.json | 223 ++++++++++++++++++++++++
 tests/test_type_apply.py | 52 +++++-
 7 files changed, 500 insertions(+), 17 deletions(-)
 create mode 100644 tests/examples/type_apply_typed_ex.json

diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py
index 4cbe4c3..b62f00f 100644
--- a/libsa4py/cst_pipeline.py
+++ b/libsa4py/cst_pipeline.py
@@ -4,8 +4,9 @@
 import csv
 import time
 
-from typing import List, Dict
+from typing import List, Dict, Tuple
 from os.path import join
+from tempfile import NamedTemporaryFile
 from pathlib import Path
 from datetime import timedelta
 from joblib import delayed
@@ -14,7 +15,8 @@
 from libsa4py.cst_transformers import TypeApplier
 from libsa4py.exceptions import ParseError, NullProjectException
 from libsa4py.nl_preprocessing import NLPreprocessor
-from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file
+from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file, \
+    create_tmp_file, write_to_tmp_file, delete_tmp_file
 from libsa4py.pyre import pyre_server_init, pyre_query_types, pyre_server_shutdown, pyre_kill_all_servers, \
     clean_pyre_config
 from libsa4py.type_check import MypyManager, type_check_single_file
@@ -280,3 +282,163 @@ def run(self, jobs: int):
         proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json')
         proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True)
         ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) for p_j in proj_jsons)
+
+
+class TypeAnnotationsRemoval:
+    """
+    Removes type annotations that cannot be type-checked by mypy
+    """
+
+    def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, apply_nlp: bool = True):
+        self.projects_path = projects_path
+        self.processed_projects_path = processed_projects_path
+        self.output_path = output_path
+        self.apply_nlp = apply_nlp
+
+    def process_file(self, f:str, f_d_repr: dict):
+        f_read = read_file(join(self.projects_path, f))
+        # TODO: The inital type-checking should not be done after adding no. type errors to the representation later on.
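+        # `type_check_single_file` returns a pair: a boolean that is True only when
+        # mypy reports zero type errors, and the number of type errors found
+        # (None when mypy produced no usable output).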
+ init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), + MypyManager('mypy', MAX_TC_TIME)) + + if init_tc == False and init_no_tc_err is None: + return + else: + self.__remove_unchecked_type_annot(f_read, f_d_repr, ) + + + def run(self, jobs: int): + self.merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + not_tced_src_f: List[Tuple[str, dict]] = [] + for p, p_v in list(self.merged_projects['projects'].items()): + for f, f_v in p_v['src_files'].items(): + if not f_v['tc']: + not_tced_src_f.append((f, f_v)) + + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int): + tmp_f = create_tmp_file(".py") + out_f_code: str = "" + for m_v, m_v_t in f_d_repr['variables'].items(): + if m_v_t != "": + print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") + f_d_repr['variables'][m_v] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['variables'][m_v] = m_v_t + + for i, fn in enumerate(f_d_repr['funcs']): + for p_n, p_t in fn['params'].items(): + if p_t != "": + print(f"Type-checking function parameter {p_n} with annotation {p_t}") + f_d_repr['funcs'][i]['params'][p_n] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['funcs'][i]['params'][p_n] = p_t + + for fn_v, fn_v_t in fn['variables'].items(): + if fn_v_t != "": + print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") + f_d_repr['funcs'][i]['variables'][fn_v] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t + + # The return type for module-level functions + if f_d_repr['funcs'][i]['ret_type'] != "": + org_t = f_d_repr['funcs'][i]['ret_type'] + print(f"Type-checking function {f_d_repr['funcs'][i]['name']} return with {org_t}") + f_d_repr['funcs'][i]['ret_type'] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['funcs'][i]['ret_type'] = org_t + + # The type of class-level vars + for c_i, c in enumerate(f_d_repr['classes']): + for c_v, c_v_t in c['variables'].items(): + if c_v_t != "": + print(f"Type checking class variable {c_v} with annotation {c_v_t}") + f_d_repr['classes'][c_i]['variables'][c_v] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t + + # The type of arguments for class-level functions + for fn_i, fn in enumerate(c['funcs']): + for p_n, p_t in fn["params"].items(): + if p_t != "": + print(f"Type-checking function parameter {p_n} with annotation {p_t}") + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif 
no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t + + # The type of local variables for class-level functions + for fn_v, fn_v_t in fn['variables'].items(): + if fn_v_t != "": + print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = p + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t + + # The return type for class-level functions + if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": + org_t = f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] + print(f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t + + # apply_inferred_types(src_f_read, src_f_ext, src_f_o_path) + delete_tmp_file(tmp_f) + return out_f_code + + def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): + f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, + apply_nlp=self.apply_nlp)) + write_to_tmp_file(out_f, f_t_applied.code) + tc, no_tc_err = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) + return tc, no_tc_err, f_t_applied.code + + + + + + + + + + diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index f115b91..23f18b8 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -962,9 +962,17 @@ def __get_var_type_an_assign(self, var_name: str): def __get_var_names_counter(self, node, scope): vars_name = match.extractall(node, match.OneOf(match.AssignTarget(target=match.SaveMatchedNode( match.Name(value=match.DoNotCare()), "name")), match.AnnAssign(target=match.SaveMatchedNode( - match.Name(value=match.DoNotCare()), "name")))) + match.Name(value=match.DoNotCare()), "name")) + )) + attr_name = match.extractall(node, match.OneOf(match.AssignTarget( + target=match.SaveMatchedNode(match.Attribute(value=match.Name(value=match.DoNotCare()), attr= + match.Name(value=match.DoNotCare())), "attr")), + match.AnnAssign(target=match.SaveMatchedNode(match.Attribute(value=match.Name(value=match.DoNotCare()), attr= + match.Name(value=match.DoNotCare())), "attr")))) return Counter([n['name'].value for n in vars_name if isinstance(self.get_metadata(cst.metadata.ScopeProvider, - n['name']), scope)]) + n['name']), scope)] + + [n['attr'].attr.value for n in attr_name if isinstance(self.get_metadata(cst.metadata.ScopeProvider, + n['attr']), scope)]) def visit_ClassDef(self, node: cst.ClassDef): self.cls_visited.append((self.__get_cls(node), @@ -987,6 +995,8 @@ def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.Fu if fn_ret_type is not None: self.all_applied_types.add((fn_ret_type_resolved, fn_ret_type)) return updated_node.with_changes(returns=fn_ret_type) + else: + return updated_node.with_changes(returns=None) return updated_node @@ -999,9 +1009,16 @@ def leave_Lambda(self, original_node: cst.Lambda, updated_node: cst.Lambda): def 
leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, updated_node: cst.SimpleStatementLine): + + # Untyped variables if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( + target=match.DoNotCare())])])): + if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( target=match.Name(value=match.DoNotCare()))])])): - t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value) + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value) + elif match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( + target=match.Attribute(value=match.Name(value=match.DoNotCare()), attr=match.Name(value=match.DoNotCare())))])])): + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.attr.value) if t is not None: t_annot_node_resolved = self.resolve_type_alias(t) @@ -1015,9 +1032,14 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, equal=cst.AssignEqual(whitespace_after=original_node.body[0].targets[0].whitespace_after_equal, whitespace_before=original_node.body[0].targets[0].whitespace_before_equal))] ) - elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Name(value=match.DoNotCare()))])): - t = self.__get_var_type_an_assign(original_node.body[0].target.value) - if t is not None: + # Typed variables + elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare())])): + if match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Name(value=match.DoNotCare()))])): + t = self.__get_var_type_an_assign(original_node.body[0].target.value) + elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Attribute(value=match.Name(value=match.DoNotCare()), + attr=match.Name(value=match.DoNotCare())))])): + t = self.__get_var_type_an_assign(original_node.body[0].target.attr.value) + if t: t_annot_node_resolved = self.resolve_type_alias(t) t_annot_node = self.__name2annotation(t_annot_node_resolved) if t_annot_node is not None: @@ -1027,6 +1049,11 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, value=original_node.body[0].value, annotation=t_annot_node, equal=original_node.body[0].equal)]) + else: + return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, + whitespace_before_equal=original_node.body[0].equal.whitespace_before, + whitespace_after_equal=original_node.body[0].equal.whitespace_after)], + value=original_node.body[0].value)]) return original_node @@ -1035,6 +1062,8 @@ def leave_Param(self, original_node: cst.Param, updated_node: cst.Param): fn_param_type = self.__get_fn_param_type(original_node.name.value) if fn_param_type is not None: return updated_node.with_changes(annotation=fn_param_type) + else: + return updated_node.with_changes(annotation=None) return original_node diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index fd5cbb2..ab97d2d 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -7,6 +7,7 @@ from abc import ABC, abstractmethod from os.path import dirname, basename +from typing import Tuple, Union from collections import Counter, namedtuple import toml import os @@ -164,9 +165,9 @@ def _report_errors(self, parsed_result): print(f"Error breaking down: 
{parsed_result.err_breakdown}.") -def type_check_single_file(f_path: str, tc: TCManager) -> bool: +def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None]]: no_t_err = tc.heavy_assess(f_path) if no_t_err is not None: - return True if no_t_err.no_type_errs == 0 else False + return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs) else: - return False + return False, None diff --git a/libsa4py/utils.py b/libsa4py/utils.py index c247c40..5519b27 100644 --- a/libsa4py/utils.py +++ b/libsa4py/utils.py @@ -2,6 +2,7 @@ from tqdm import tqdm from joblib import Parallel from os.path import join, isdir +from tempfile import NamedTemporaryFile from pathlib import Path import time import os @@ -113,3 +114,24 @@ def find_repos_list(projects_path: str) -> List[dict]: def mk_dir_not_exist(path: str): if not isdir(path): os.mkdir(path) + + +def create_tmp_file(suffix: str): + """ + It creates a temporary file. + NOTE: the temp file should be deleted manually after creation. + """ + return NamedTemporaryFile(mode="w", delete=False, suffix=suffix) + + +def delete_tmp_file(tmp_f: NamedTemporaryFile): + try: + os.unlink(tmp_f.name) + except TypeError: + print("Couldn't delete ", tmp_f.name) + + +def write_to_tmp_file(tmp_f: NamedTemporaryFile, text: str): + tmp_f.write(text) + #tmp_f.close() + return tmp_f diff --git a/tests/examples/type_apply_ex.json b/tests/examples/type_apply_ex.json index 39e4227..1727f94 100644 --- a/tests/examples/type_apply_ex.json +++ b/tests/examples/type_apply_ex.json @@ -64,7 +64,7 @@ "name": "Foo", "q_name": "Foo", "variables": { - "foo_v": "", + "foo_v": "str", "foo_p": "pathlib.Path" }, "cls_var_occur": { @@ -134,11 +134,11 @@ 16 ] ], - "params": {}, + "params": {"self": ""}, "ret_exprs": [], "params_occur": {}, "ret_type": "", - "variables": {}, + "variables": {"i": "int"}, "fn_var_occur": {}, "params_descr": {}, "docstring": { diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json new file mode 100644 index 0000000..85bd099 --- /dev/null +++ b/tests/examples/type_apply_typed_ex.json @@ -0,0 +1,223 @@ +{ + "tests/examples": { + "src_files": { + "type_apply_typed.py": { + "untyped_seq": "a = [number] [EOL] l = [ [number] , [number] , [number] ] [EOL] c = [number] [EOL] [EOL] def foo ( x , y ) : [EOL] z = x + y [EOL] return z [EOL] [EOL] class Bar : [EOL] bar_var1 = [string] [EOL] bar_var2 = [number] [EOL] def __init__ ( a , b ) : [EOL] self . a = a [EOL] self . 
b = b [EOL] def delta ( n ) : [EOL] return [ [number] ] * p [EOL]", + "typed_seq": "$builtins.int$ 0 0 0 $List[int]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 $builtins.float$ 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 $builtins.int$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 $List[float]$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0", + "imports": [], + "variables": { + "a": "", + "l": "", + "c": "" + }, + "mod_var_occur": { + "a": [ + [ + "self", + "a", + "builtins", + "int", + "a" + ] + ], + "l": [], + "c": [] + }, + "classes": [ + { + "name": "Bar", + "q_name": "Bar", + "variables": { + "bar_var1": "", + "bar_var2": "" + }, + "cls_var_occur": { + "bar_var1": [], + "bar_var2": [] + }, + "funcs": [ + { + "name": "__init__", + "q_name": "Bar.__init__", + "fn_lc": [ + [ + 12, + 4 + ], + [ + 14, + 18 + ] + ], + "params": { + "a": "", + "b": "" + }, + "ret_exprs": [], + "params_occur": { + "a": [ + [ + "self", + "a", + "builtins", + "int", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "ret_type": "", + "variables": { + "a": "", + "b": "" + }, + "fn_var_occur": { + "a": [ + [ + "self", + "a", + "builtins", + "int", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "params_descr": { + "a": "", + "b": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "delta", + "q_name": "Bar.delta", + "fn_lc": [ + [ + 15, + 4 + ], + [ + 16, + 25 + ] + ], + "params": { + "n": "" + }, + "ret_exprs": [ + "return [2.17] * p" + ], + "params_occur": { + "n": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "params_descr": { + "n": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } + ] + } + ], + "funcs": [ + { + "name": "foo", + "q_name": "foo", + "fn_lc": [ + [ + 5, + 0 + ], + [ + 7, + 12 + ] + ], + "params": { + "x": "", + "y": "" + }, + "ret_exprs": [ + "return z" + ], + "params_occur": { + "x": [ + [ + "z", + "builtins", + "int", + "x", + "y" + ] + ], + "y": [ + [ + "z", + "builtins", + "int", + "x", + "y" + ] + ] + }, + "ret_type": "", + "variables": { + "z": "" + }, + "fn_var_occur": { + "z": [ + [ + "z", + "builtins", + "int", + "x", + "y" + ] + ] + }, + "params_descr": { + "x": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } + ], + "set": null, + "tc": false, + "no_types_annot": { + "U": 0, + "D": 0, + "I": 0 + }, + "type_annot_cove": 0.0 +} + } + } +} \ No newline at end of file diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 5fb08c4..611258d 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -16,7 +16,8 @@ class Foo: class Delta: foo_d = 'Hello, Delta!' foo_p = Path('/home/foo/bar') - def __init__(): + def __init__(self): + self.i = 10 def foo_inner(c, d=lambda a,b: a == b): pass def foo_fn(self, y): @@ -57,7 +58,8 @@ class Foo: class Delta: foo_d = 'Hello, Delta!' foo_p: pathlib.Path = Path('/home/foo/bar') - def __init__(): + def __init__(self): + self.i: int = 10 def foo_inner(c: str, d=lambda a,b: a == b): pass def foo_fn(self, y)-> typing.Dict[builtins.str, builtins.bool]: @@ -79,6 +81,38 @@ def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[bu return v """ +test_file_typed = """a: int = 12 +l: List[int] = [1,2,3] +c = 2.71 +def foo(x: int, y: int) -> int: + z: int = x + y + return z +class Bar: + bar_var1: str = "Hello, Bar!" 
+ bar_var2: float = 3.14 + def __init__(a: int, b): + self.a: int = a + self.b = b + def delta(n: int) -> List[float]: + return [2.17] * p +""" + +test_file_typed_exp = """a = 12 +l = [1,2,3] +c = 2.71 +def foo(x, y): + z = x + y + return z +class Bar: + bar_var1 = "Hello, Bar!" + bar_var2 = 3.14 + def __init__(a, b): + self.a = a + self.b = b + def delta(n): + return [2.17] * p +""" + class TestTypeAnnotatingProjects(unittest.TestCase): """ @@ -92,8 +126,11 @@ def __init__(self, *args, **kwargs): def setUpClass(cls): mk_dir_not_exist('./tmp_ta') write_file('./tmp_ta/type_apply.py', test_file) + write_file('./tmp_ta/type_apply_typed.py', test_file_typed) + # from libsa4py.cst_extractor import Extractor - # save_json('./tmp_ta/type_apply_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply.py')).to_dict()) + # # save_json('./tmp_ta/type_apply_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply.py')).to_dict()) + # save_json('./tmp_ta/type_apply_typed_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply_typed.py')).to_dict()) def test_type_apply_pipeline(self): ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) @@ -109,6 +146,15 @@ def test_type_apply_pipeline(self): # The imported types from typing self.assertEqual(Counter(" ".join(exp_split[0:7])), Counter(" ".join(out_split[0:7]))) + def test_type_apply_remove_annot(self): + """ + Tests the removal of type annotations if not present in the JSON output + """ + ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) + ta.process_project('./examples/type_apply_typed_ex.json') + + self.assertEqual(test_file_typed_exp, read_file('./tmp_ta/type_apply_typed.py')) + @classmethod def tearDownClass(cls): shutil.rmtree("./tmp_ta/") From 54ba46b723a3b47620a668dcb43b8f943e71d759 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 10:24:39 +0200 Subject: [PATCH 02/31] Removing type annotations that do not type check by mypy [WIP] - Part 2 --- libsa4py/__main__.py | 13 ++++- libsa4py/cst_pipeline.py | 109 +++++++++++++++++++++------------------ 2 files changed, 72 insertions(+), 50 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index fc891cf..16a0b31 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -1,7 +1,7 @@ from argparse import ArgumentParser from multiprocessing import cpu_count from libsa4py.utils import find_repos_list -from libsa4py.cst_pipeline import Pipeline, TypeAnnotatingProjects +from libsa4py.cst_pipeline import Pipeline, TypeAnnotatingProjects, TypeAnnotationsRemoval from libsa4py.merge import merge_projects @@ -16,6 +16,11 @@ def apply_types_projects(args): tap.run(args.j) +def remove_err_type_annotations(args): + tar = TypeAnnotationsRemoval(args.p, args.o, "") + tar.run(args.j) + + def main(): arg_parser = ArgumentParser(description="Light-weight static analysis to extract Python's code representations") @@ -53,6 +58,12 @@ def main(): apply_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing projects") apply_parser.set_defaults(func=apply_types_projects) + remove_parser = sub_parsers.add_parser('remove') + remove_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") + remove_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") + remove_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing files") + remove_parser.set_defaults(func=remove_err_type_annotations) + args = 
arg_parser.parse_args() args.func(args) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index b62f00f..042cb7b 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -59,7 +59,8 @@ def __init__(self, projects_path, output_dir, nlp_transf: bool = True, if self.use_tc: self.tc = MypyManager('mypy', MAX_TC_TIME) - self.split_dataset_files = {f:s for s, f in csv.reader(open(split_files_path, 'r'))} if split_files_path is not None else {} + self.split_dataset_files = {f: s for s, f in + csv.reader(open(split_files_path, 'r'))} if split_files_path is not None else {} # TODO: Fix the logger issue not outputing the logs into the file. # logging.basicConfig(filename=join(self.err_log_dir, "pipeline_errors.log"), level=logging.DEBUG, @@ -83,17 +84,17 @@ def __setup_pipeline_logger(self, log_dir: str): logger_ch = logging.StreamHandler() logger_ch.setLevel(logging.DEBUG) - + logger_fh = logging.FileHandler(filename=log_dir) logger_fh.setLevel(logging.DEBUG) - + logger_formatter = logging.Formatter(fmt='%(asctime)s - %(name)s - %(message)s') logger_ch.setFormatter(logger_formatter) logger_fh.setFormatter(logger_formatter) logger.addHandler(logger_ch) logger.addHandler(logger_fh) - + return logger def get_project_filename(self, project) -> str: @@ -125,14 +126,17 @@ def fn_nlp_transf(fn_d: dict, nlp_prep: NLPreprocessor): fn_d['docstring']['long_descr'] = nlp_prep.process_sentence(fn_d['docstring']['long_descr']) return fn_d - extracted_module['variables'] = {self.nlp_prep.process_identifier(v): t for v, t in extracted_module['variables'].items()} + extracted_module['variables'] = {self.nlp_prep.process_identifier(v): t for v, t in + extracted_module['variables'].items()} extracted_module['mod_var_occur'] = {v: [self.nlp_prep.process_sentence(j) for i in o for j in i] for v, - o in extracted_module['mod_var_occur'].items()} + o in + extracted_module['mod_var_occur'].items()} for c in extracted_module['classes']: c['variables'] = {self.nlp_prep.process_identifier(v): t for v, t in c['variables'].items()} c['cls_var_occur'] = {v: [self.nlp_prep.process_sentence(j) for i in o for j in i] for v, - o in c['cls_var_occur'].items()} + o in + c['cls_var_occur'].items()} c['funcs'] = [fn_nlp_transf(f, self.nlp_prep) for f in c['funcs']] extracted_module['funcs'] = [fn_nlp_transf(f, self.nlp_prep) for f in extracted_module['funcs']] @@ -157,7 +161,8 @@ def process_project(self, i, project): project_files = [(f, str(Path(f).relative_to(Path(self.projects_path).parent))) for f in project_files] project_files = [(f, f_r, self.split_dataset_files[f_r] if f_r in self.split_dataset_files else None) for f, - f_r in project_files] + f_r + in project_files] if len(project_files) != 0: if self.use_pyre: @@ -195,10 +200,10 @@ def process_project(self, i, project): # fail the entire project processing. # TODO: A better workaround would be to have a specialized exception thrown # by the extractor, so that this exception is specialized. 
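             # A failure in a single file is logged below and skipped, so that it
             # does not abort processing of the remaining files in the project.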
- #print(f"Could not process file {filename}") + # print(f"Could not process file {filename}") traceback.print_exc() self.logger.error("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) - #logging.error("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + # logging.error("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) print(f'Saving available type hints for {project_id}...') if self.avl_types_dir is not None: @@ -241,7 +246,8 @@ def run(self, repos_list: List[Dict], jobs, start=0): start_t = time.time() ParallelExecutor(n_jobs=jobs)(total=len(repos_list))( delayed(self.process_project)(i, project) for i, project in enumerate(repos_list, start=start)) - print("Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time()-start_t)))) + print( + "Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time() - start_t)))) if self.use_pyre: pyre_kill_all_servers() @@ -295,38 +301,47 @@ def __init__(self, projects_path: str, processed_projects_path: str, output_path self.output_path = output_path self.apply_nlp = apply_nlp - def process_file(self, f:str, f_d_repr: dict): + def process_file(self, f: str, f_d_repr: dict): f_read = read_file(join(self.projects_path, f)) - # TODO: The inital type-checking should not be done after adding no. type errors to the representation later on. + # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), MypyManager('mypy', MAX_TC_TIME)) if init_tc == False and init_no_tc_err is None: return else: - self.__remove_unchecked_type_annot(f_read, f_d_repr, ) - + tmp_f = create_tmp_file(".py") + f_tc_code, tc_errs = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) + print(f"F: {Path(f).name} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs}") + # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) + # write_file(join(self.projects_path, f), f_tc_code) + delete_tmp_file(tmp_f) def run(self, jobs: int): - self.merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) not_tced_src_f: List[Tuple[str, dict]] = [] - for p, p_v in list(self.merged_projects['projects'].items()): + for p, p_v in list(merged_projects['projects'].items()): for f, f_v in p_v['src_files'].items(): if not f_v['tc']: not_tced_src_f.append((f, f_v)) - def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int): - tmp_f = create_tmp_file(".py") + ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d) \ + for f, f_d in not_tced_src_f) + + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, + f_out_temp: NamedTemporaryFile) -> Tuple[str, int]: + out_f_code: str = "" for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") f_d_repr['variables'][m_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == 
init_no_tc_err: f_d_repr['variables'][m_v] = m_v_t @@ -335,11 +350,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['funcs'][i]['params'][p_n] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['params'][p_n] = p_t @@ -347,11 +363,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['funcs'][i]['variables'][fn_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t @@ -360,11 +377,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ org_t = f_d_repr['funcs'][i]['ret_type'] print(f"Type-checking function {f_d_repr['funcs'][i]['name']} return with {org_t}") f_d_repr['funcs'][i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['ret_type'] = org_t @@ -374,11 +392,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if c_v_t != "": print(f"Type checking class variable {c_v} with annotation {c_v_t}") f_d_repr['classes'][c_i]['variables'][c_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t @@ -388,11 +407,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t @@ -401,30 +421,31 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = p - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, 
tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": org_t = f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] - print(f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") + print( + f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - # apply_inferred_types(src_f_read, src_f_ext, src_f_o_path) - delete_tmp_file(tmp_f) - return out_f_code + return out_f_code, init_no_tc_err def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, @@ -432,13 +453,3 @@ def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: Named write_to_tmp_file(out_f, f_t_applied.code) tc, no_tc_err = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) return tc, no_tc_err, f_t_applied.code - - - - - - - - - - From dfe612e2605ab972631d2e333132c3ca240822d1 Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Thu, 1 Jul 2021 11:49:37 +0200 Subject: [PATCH 03/31] Removing type annotations that do not type check by mypy [WIP] - Part 3 --- libsa4py/cst_pipeline.py | 6 +++--- libsa4py/cst_transformers.py | 17 +++++++++++++---- libsa4py/type_check.py | 12 ++++++++---- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 042cb7b..bad9367 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -318,7 +318,7 @@ def process_file(self, f: str, f_d_repr: dict): delete_tmp_file(tmp_f) def run(self, jobs: int): - merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + merged_projects = load_json(join(self.processed_projects_path, "merged_512_projects.json")) not_tced_src_f: List[Tuple[str, dict]] = [] for p, p_v in list(merged_projects['projects'].items()): for f, f_v in p_v['src_files'].items(): @@ -406,7 +406,7 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ for p_n, p_t in fn["params"].items(): if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: return f_code, no_tc_err @@ -420,7 +420,7 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") - 
f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = p
+                        f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = ""
                         tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp)
                         if tc:
                             return f_code, no_tc_err
diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py
index 23f18b8..a5d4558 100644
--- a/libsa4py/cst_transformers.py
+++ b/libsa4py/cst_transformers.py
@@ -1011,6 +1011,7 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine,
                               updated_node: cst.SimpleStatementLine):
 
         # Untyped variables
+        t = None
         if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget(
             target=match.DoNotCare())])])):
             if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget(
@@ -1050,10 +1051,18 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine,
                                  annotation=t_annot_node,
                                  equal=original_node.body[0].equal)])
             else:
-                return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target,
-                                  whitespace_before_equal=original_node.body[0].equal.whitespace_before,
-                                  whitespace_after_equal=original_node.body[0].equal.whitespace_after)],
-                                  value=original_node.body[0].value)])
+                try:
+                    return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target,
+                                  whitespace_before_equal=original_node.body[0].equal.whitespace_before,
+                                  whitespace_after_equal=original_node.body[0].equal.whitespace_after)],
+                                  value=original_node.body[0].value)])
+                except AttributeError:
+                    print("AT", original_node.body[0])
+                    return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target,
+                                  whitespace_before_equal=original_node.body[0].equal.whitespace_before,
+                                  whitespace_after_equal=original_node.body[0].equal.whitespace_after)],
+                                  value=original_node.body[0].value)])
+
 
         return original_node
 
diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py
index ab97d2d..abcfe28 100644
--- a/libsa4py/type_check.py
+++ b/libsa4py/type_check.py
@@ -166,8 +166,12 @@ def _report_errors(self, parsed_result):
 
 
 def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None]]:
-    no_t_err = tc.heavy_assess(f_path)
-    if no_t_err is not None:
-        return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs)
-    else:
+    try:
+        no_t_err = tc.heavy_assess(f_path)
+        if no_t_err is not None:
+            return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs)
+        else:
+            return False, None
+    except IndexError:
+        print(f"f: {f_path} - No output from Mypy!")
         return False, None

From d2a41f222c7ab64098bcb32645d26f9ed7ddde19 Mon Sep 17 00:00:00 2001
From: mir-am
Date: Thu, 1 Jul 2021 13:56:42 +0200
Subject: [PATCH 04/31] Fixed AttributeError when removing annotations for uninitialized vars

---
 libsa4py/cst_transformers.py | 18 ++++++------------
 tests/examples/type_apply_typed_ex.json | 3 ++-
 tests/test_type_apply.py | 2 ++
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py
index a5d4558..7576f65 100644
--- a/libsa4py/cst_transformers.py
+++ b/libsa4py/cst_transformers.py
@@ -1034,7 +1034,8 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine,
                                whitespace_before=original_node.body[0].targets[0].whitespace_before_equal))]
                 )
         # Typed variables
-        elif match.matches(original_node, 
match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare())])): + elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare(), + value=match.MatchIfTrue(lambda v: v is not None))])): if match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Name(value=match.DoNotCare()))])): t = self.__get_var_type_an_assign(original_node.body[0].target.value) elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Attribute(value=match.Name(value=match.DoNotCare()), @@ -1051,17 +1052,10 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, annotation=t_annot_node, equal=original_node.body[0].equal)]) else: - try: - return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, - whitespace_before_equal=original_node.body[0].equal.whitespace_before, - whitespace_after_equal=original_node.body[0].equal.whitespace_after)], - value=original_node.body[0].value)]) - except AttributeError: - print("AT", original_node.body[0]) - return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, - whitespace_before_equal=original_node.body[0].equal.whitespace_before, - whitespace_after_equal=original_node.body[0].equal.whitespace_after)], - value=original_node.body[0].value)]) + return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, + whitespace_before_equal=original_node.body[0].equal.whitespace_before, + whitespace_after_equal=original_node.body[0].equal.whitespace_after)], + value=original_node.body[0].value)]) return original_node diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index 85bd099..f40b525 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -8,7 +8,8 @@ "variables": { "a": "", "l": "", - "c": "" + "c": "", + "h": "" }, "mod_var_occur": { "a": [ diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 611258d..427a0ab 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -84,6 +84,7 @@ def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[bu test_file_typed = """a: int = 12 l: List[int] = [1,2,3] c = 2.71 +h: dict def foo(x: int, y: int) -> int: z: int = x + y return z @@ -100,6 +101,7 @@ def delta(n: int) -> List[float]: test_file_typed_exp = """a = 12 l = [1,2,3] c = 2.71 +h: dict def foo(x, y): z = x + y return z From a1703082f40f0941a0d2f8878753ad77a529caf6 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 14:17:59 +0200 Subject: [PATCH 05/31] Removing type annotations that do not type check by mypy [WIP] - Part 4 --- libsa4py/cst_pipeline.py | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index bad9367..74f511c 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -311,8 +311,9 @@ def process_file(self, f: str, f_d_repr: dict): return else: tmp_f = create_tmp_file(".py") - f_tc_code, tc_errs = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) - print(f"F: {Path(f).name} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs}") + f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, + tmp_f) + print(f"F: {f} | 
init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r}") # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) # write_file(join(self.projects_path, f), f_tc_code) delete_tmp_file(tmp_f) @@ -329,19 +330,22 @@ def run(self, jobs: int): for f, f_d in not_tced_src_f) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, - f_out_temp: NamedTemporaryFile) -> Tuple[str, int]: + f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: out_f_code: str = "" + type_annots_removed: List[str] = [] for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") f_d_repr['variables'][m_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(m_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(m_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['variables'][m_v] = m_v_t @@ -352,10 +356,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['funcs'][i]['params'][p_n] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(p_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(p_t) elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['params'][p_n] = p_t @@ -365,10 +371,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['funcs'][i]['variables'][fn_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(fn_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(fn_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t @@ -379,10 +387,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['funcs'][i]['ret_type'] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(org_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(org_t) elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['ret_type'] = org_t @@ -394,10 +404,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['variables'][c_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(c_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(c_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t @@ -409,10 +421,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = "" tc, no_tc_err, f_code = 
self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(p_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(p_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t @@ -423,10 +437,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(fn_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(fn_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t @@ -438,14 +454,16 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(org_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(org_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - return out_f_code, init_no_tc_err + return out_f_code, init_no_tc_err, type_annots_removed def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, From f7b286d911a4cc0fc6a7239403f6ea0d29e31123 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 14:43:09 +0200 Subject: [PATCH 06/31] Removing type annotations that do not type check by mypy [WIP] - Part 4 --- libsa4py/cst_pipeline.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 74f511c..0d652f2 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -10,6 +10,7 @@ from pathlib import Path from datetime import timedelta from joblib import delayed +from multiprocessing import Manager from dpu_utils.utils.dataloading import load_jsonl_gz from libsa4py.cst_extractor import Extractor from libsa4py.cst_transformers import TypeApplier @@ -301,7 +302,7 @@ def __init__(self, projects_path: str, processed_projects_path: str, output_path self.output_path = output_path self.apply_nlp = apply_nlp - def process_file(self, f: str, f_d_repr: dict): + def process_file(self, f: str, f_d_repr: dict, tc_res: dict): f_read = read_file(join(self.projects_path, f)) # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. 
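         # `tc_res` is a dict shared between workers via multiprocessing.Manager;
         # each entry maps a source file to its before/after type-error counts and
         # the annotations that were removed from it.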
init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), @@ -313,7 +314,10 @@ def process_file(self, f: str, f_d_repr: dict): tmp_f = create_tmp_file(".py") f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) - print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r}") + print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ + total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") + tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) # write_file(join(self.projects_path, f), f_tc_code) delete_tmp_file(tmp_f) @@ -326,9 +330,13 @@ def run(self, jobs: int): if not f_v['tc']: not_tced_src_f.append((f, f_v)) - ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d) \ + manager = Manager() + tc_res = manager.dict() + ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) + save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res) + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: From 715430a8752d7d5018aba6ca8d9c5d78a24b1825 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 15:32:26 +0200 Subject: [PATCH 07/31] Removing type annotations that do not type check by mypy [WIP] - Part 5 --- libsa4py/cst_pipeline.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 0d652f2..ac33ba7 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -311,16 +311,18 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): if init_tc == False and init_no_tc_err is None: return else: - tmp_f = create_tmp_file(".py") - f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, - tmp_f) - print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ - total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") - tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, - "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} - # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - # write_file(join(self.projects_path, f), f_tc_code) - delete_tmp_file(tmp_f) + # Only files with type annotations + if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: + tmp_f = create_tmp_file(".py") + f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, + tmp_f) + print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ + total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") + tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} + # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) + # 
write_file(join(self.projects_path, f), f_tc_code) + delete_tmp_file(tmp_f) def run(self, jobs: int): merged_projects = load_json(join(self.processed_projects_path, "merged_512_projects.json")) @@ -335,7 +337,7 @@ def run(self, jobs: int): ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) - save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res) + save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res.copy()) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: From 8f46016dfff3a8b629dc49582ccd27f7b01aea9b Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Fri, 2 Jul 2021 23:04:21 +0200 Subject: [PATCH 08/31] Removing type annotations that do not type check by mypy [WIP] - Part 6 --- libsa4py/cst_pipeline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 0d652f2..c2edcc3 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -314,28 +314,28 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): tmp_f = create_tmp_file(".py") f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) - print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ - total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") + print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - # write_file(join(self.projects_path, f), f_tc_code) + write_file(join(self.projects_path, f), f_tc_code) delete_tmp_file(tmp_f) def run(self, jobs: int): - merged_projects = load_json(join(self.processed_projects_path, "merged_512_projects.json")) + merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) not_tced_src_f: List[Tuple[str, dict]] = [] for p, p_v in list(merged_projects['projects'].items()): for f, f_v in p_v['src_files'].items(): if not f_v['tc']: not_tced_src_f.append((f, f_v)) + #not_tced_src_f = not_tced_src_f[:250] manager = Manager() tc_res = manager.dict() ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) - save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res) + save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res.copy()) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: From c63897306b1cc5a39791135732526ec5b6050b8b Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Mon, 12 Jul 2021 11:19:04 +0200 Subject: [PATCH 09/31] Improvements to the TypeAnnotationsRemoval pipeline --- libsa4py/cst_pipeline.py | 190 ++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 71 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 2b1ef84..692387e 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ 
-313,8 +313,8 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): # Only files with type annotations if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: try: - f_read = read_file(join(self.projects_path, f)) tmp_f = create_tmp_file(".py") + f_read = read_file(join(self.projects_path, f)) f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], tmp_f) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ @@ -337,81 +337,113 @@ def run(self, jobs: int): if not f_v['tc'][0] and f_v['tc'] != [False, None]: not_tced_src_f.append((f, f_v)) - print("L:", len(not_tced_src_f)) - #not_tced_src_f = not_tced_src_f[:250] + del merged_projects + # not_tced_src_f = not_tced_src_f[:250] + # print("L:", len(not_tced_src_f)) manager = Manager() tc_res = manager.dict() ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) - save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res.copy()) + save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: - out_f_code: str = "" type_annots_removed: List[str] = [] + + def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + if no_tc_err is not None: + if tc: + type_annots_removed.append(org_gt) + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + init_no_tc_err = no_tc_err + type_annots_removed.append(org_gt) + elif no_tc_err == init_no_tc_err: + org_gt_d = org_gt + + return tc, no_tc_err, f_code + + out_f_code: str = "" for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") f_d_repr['variables'][m_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(m_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(m_v_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['variables'][m_v] = m_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, m_v_t, + f_d_repr['variables'][m_v]) if tc: - type_annots_removed.append(m_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(m_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['variables'][m_v] = m_v_t for i, fn in enumerate(f_d_repr['funcs']): for p_n, p_t in fn['params'].items(): if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['funcs'][i]['params'][p_n] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) - if tc: - type_annots_removed.append(p_t) - return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(p_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['funcs'][i]['params'][p_n] = p_t + # tc, no_tc_err, f_code = 
self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(p_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(p_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['funcs'][i]['params'][p_n] = p_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['funcs'][i]['params'][p_n]) + if tc: + return f_code, no_tc_err, type_annots_removed for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['funcs'][i]['variables'][fn_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(fn_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(fn_v_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['funcs'][i]['variables'][fn_v]) if tc: - type_annots_removed.append(fn_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(fn_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t # The return type for module-level functions if f_d_repr['funcs'][i]['ret_type'] != "": org_t = f_d_repr['funcs'][i]['ret_type'] print(f"Type-checking function {f_d_repr['funcs'][i]['name']} return with {org_t}") f_d_repr['funcs'][i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(org_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(org_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['funcs'][i]['ret_type'] = org_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['funcs'][i]['ret_type']) if tc: - type_annots_removed.append(org_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(org_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['funcs'][i]['ret_type'] = org_t # The type of class-level vars for c_i, c in enumerate(f_d_repr['classes']): @@ -419,16 +451,20 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if c_v_t != "": print(f"Type checking class variable {c_v} with annotation {c_v_t}") f_d_repr['classes'][c_i]['variables'][c_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(c_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(c_v_t) + # elif no_tc_err == init_no_tc_err: + # 
f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, c_v_t, + f_d_repr['classes'][c_i]['variables'][c_v]) if tc: - type_annots_removed.append(c_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(c_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t # The type of arguments for class-level functions for fn_i, fn in enumerate(c['funcs']): @@ -436,32 +472,40 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(p_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(p_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) if tc: - type_annots_removed.append(p_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(p_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t # The type of local variables for class-level functions for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(fn_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(fn_v_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) if tc: - type_annots_removed.append(fn_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(fn_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": @@ -469,16 +513,20 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ print( f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # 
type_annots_removed.append(org_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(org_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) if tc: - type_annots_removed.append(org_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(org_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t return out_f_code, init_no_tc_err, type_annots_removed From 7aa76bdd7b359659c64fd0879afc57a667a5e82a Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 12 Jul 2021 11:32:04 +0200 Subject: [PATCH 10/31] Improve type annotation removal code --- libsa4py/cst_pipeline.py | 46 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 692387e..e015697 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -352,16 +352,16 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ type_annots_removed: List[str] = [] - def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): + def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if no_tc_err is not None: if tc: type_annots_removed.append(org_gt) - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err + elif no_tc_err < curr_no_tc_err: + curr_f_code = f_code + curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) - elif no_tc_err == init_no_tc_err: + elif no_tc_err == curr_no_tc_err: org_gt_d = org_gt return tc, no_tc_err, f_code @@ -381,8 +381,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(m_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['variables'][m_v] = m_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, m_v_t, - f_d_repr['variables'][m_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, m_v_t, + f_d_repr['variables'][m_v]) if tc: return f_code, no_tc_err, type_annots_removed @@ -401,10 +401,10 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['params'][p_n] = p_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['funcs'][i]['params'][p_n]) - if tc: - return f_code, no_tc_err, type_annots_removed + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['funcs'][i]['params'][p_n]) + if tc: + return f_code, no_tc_err, type_annots_removed for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": @@ -420,8 +420,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['funcs'][i]['variables'][fn_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['funcs'][i]['variables'][fn_v]) 
if tc: return f_code, no_tc_err, type_annots_removed @@ -440,8 +440,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['ret_type'] = org_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['funcs'][i]['ret_type']) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['funcs'][i]['ret_type']) if tc: return f_code, no_tc_err, type_annots_removed @@ -461,8 +461,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(c_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, c_v_t, - f_d_repr['classes'][c_i]['variables'][c_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, c_v_t, + f_d_repr['classes'][c_i]['variables'][c_v]) if tc: return f_code, no_tc_err, type_annots_removed @@ -482,8 +482,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) if tc: return f_code, no_tc_err, type_annots_removed @@ -502,8 +502,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) if tc: return f_code, no_tc_err, type_annots_removed @@ -523,8 +523,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) if tc: return f_code, no_tc_err, type_annots_removed From 17f8bccad42539c7a2d68e4a8a73933869505b78 Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Mon, 19 Jul 2021 10:22:20 +0200 Subject: [PATCH 11/31] Improve TypeApplier by matching functions by line and column no. & matching class names' QN using IN operator --- libsa4py/cst_transformers.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 7576f65..d531fd9 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -859,7 +859,8 @@ class TypeApplier(cst.CSTTransformer): Specifically, it applies the type of arguments, return types, and variables' type. 
""" - METADATA_DEPENDENCIES = (cst.metadata.ScopeProvider, cst.metadata.QualifiedNameProvider) + METADATA_DEPENDENCIES = (cst.metadata.ScopeProvider, cst.metadata.QualifiedNameProvider, + cst.metadata.PositionProvider) def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.f_processed_dict = f_processeed_dict @@ -884,8 +885,9 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: fns = self.f_processed_dict['funcs'] for fn in fns: - if fn['q_name'] == self.__get_qualified_name(f_node.name) and \ - set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + # if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ + # set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + if fn['fn_lc'] == self.__get_line_column_no(f_node): return fn def __get_fn_param_type(self, param_name: str): @@ -899,7 +901,8 @@ def __get_fn_param_type(self, param_name: str): def __get_cls(self, cls: cst.ClassDef) -> dict: for c in self.f_processed_dict['classes']: - if c['q_name'] == self.__get_qualified_name(cls.name): + q = self.__get_qualified_name(cls.name) + if c['q_name'] in self.__get_qualified_name(cls.name): return c def __get_fn_vars(self, var_name: str) -> dict: @@ -1130,6 +1133,10 @@ def __get_qualified_name(self, node) -> Optional[str]: q_name = list(self.get_metadata(cst.metadata.QualifiedNameProvider, node)) return q_name[0].name if len(q_name) != 0 else None + def __get_line_column_no(self, node) -> List[List[int]]: + lc = self.get_metadata(cst.metadata.PositionProvider, node) + return [[lc.start.line, lc.start.column], [lc.end.line, lc.end.column]] + def resolve_type_alias(self, t: str): type_aliases = {'^{}$|^Dict$|(?<=.*)Dict\[\](?<=.*)|(?<=.*)Dict\[Any, *?Any\](?=.*)|^Dict\[unknown, *Any\]$': 'dict', '^Set$|(?<=.*)Set\[\](?<=.*)|^Set\[Any\]$': 'set', From 221c14c5c97eb1c90fb65bdd1eee9308ea531e37 Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 19 Jul 2021 11:23:22 +0200 Subject: [PATCH 12/31] Fix unit tests for TypeApplier when matching functions based on line and column no. 
--- libsa4py/cst_transformers.py | 6 +- tests/examples/type_apply_ex.json | 909 ++++++++++++++---------- tests/examples/type_apply_typed_ex.json | 8 +- tests/test_type_apply.py | 10 +- 4 files changed, 551 insertions(+), 382 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index d531fd9..53e5863 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -887,7 +887,7 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: for fn in fns: # if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ # set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): - if fn['fn_lc'] == self.__get_line_column_no(f_node): + if (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): return fn def __get_fn_param_type(self, param_name: str): @@ -1133,9 +1133,9 @@ def __get_qualified_name(self, node) -> Optional[str]: q_name = list(self.get_metadata(cst.metadata.QualifiedNameProvider, node)) return q_name[0].name if len(q_name) != 0 else None - def __get_line_column_no(self, node) -> List[List[int]]: + def __get_line_column_no(self, node) -> Tuple[int, int]: lc = self.get_metadata(cst.metadata.PositionProvider, node) - return [[lc.start.line, lc.start.column], [lc.end.line, lc.end.column]] + return lc.start.line, lc.end.line def resolve_type_alias(self, t: str): type_aliases = {'^{}$|^Dict$|(?<=.*)Dict\[\](?<=.*)|(?<=.*)Dict\[Any, *?Any\](?=.*)|^Dict\[unknown, *Any\]$': 'dict', diff --git a/tests/examples/type_apply_ex.json b/tests/examples/type_apply_ex.json index 1727f94..5976215 100644 --- a/tests/examples/type_apply_ex.json +++ b/tests/examples/type_apply_ex.json @@ -1,417 +1,584 @@ { - "tests/examples": { - "src_files": { - "type_apply.py": { - "untyped_seq": "from typing import Tuple , Dict , List , Literal [EOL] from collections import defaultdict [EOL] import pandas [EOL] import pathlib [EOL] import builtins [EOL] import collections [EOL] import typing [EOL] from pathlib import Path [EOL] x = [number] [EOL] l = [ ( [number] , [number] ) ] [EOL] c = defaultdict ( int ) [EOL] df = pd . DataFrame ( [ [number] , [number] ] ) [EOL] dff = pd . DataFrame ( [ [number] , [number] ] ) [EOL] lit = [string] [EOL] class Foo : [EOL] foo_v = [string] [EOL] class Delta : [EOL] foo_d = [string] [EOL] foo_p = Path ( [string] ) [EOL] def __init__ ( ) : [EOL] def foo_inner ( c , d ) : [EOL] pass [EOL] def foo_fn ( self , y ) : [EOL] def foo_inner ( a , b , c , d ) : [EOL] pass [EOL] d = { [string] : True } [EOL] return d [EOL] @ event . getter def get_e ( self ) : [EOL] return Foo . foo_v [EOL] @ event . setter def get_e ( self , y ) : [EOL] Foo . foo_v = y [EOL] return Foo . 
foo_v [EOL] foo_v = [string] [EOL] def Bar ( x = [ [string] , [string] ] ) : [EOL] v = x [EOL] l = lambda e : e + [number] [EOL] return v [EOL]", - "typed_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 0 0 $typing.List[typing.Tuple[builtins.int,builtins.int]]$ 0 0 0 0 0 0 0 0 0 $collections.defaultdict$ 0 0 0 0 0 0 $pandas.DataFrame$ 0 0 0 0 0 0 0 0 0 0 0 0 $typing.List[pandas.arrays.PandasArray]$ 0 0 0 0 0 0 0 0 0 0 0 0 $typing.Literal$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $pathlib.Path$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $typing.Dict[builtins.str,builtins.bool]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $typing.Dict[builtins.str,builtins.bool]$ 0 0 0 0 0 0 0 0 $typing.Dict[builtins.str,builtins.bool]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 0 0 0 0 0 0 0 0 $typing.List[builtins.str]$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 $typing.List[typing.Tuple[builtins.int,builtins.int]]$ 0 0 0 0 0 0 0 0 0 0 0", - "imports": [ - "Tuple", - "Dict", - "List", - "Literal", - "defaultdict", - "pandas", - "pathlib", - "builtins", - "collections", - "typing", - "Path" - ], - "variables": { - "x": "builtins.int", - "l": "typing.List[typing.Tuple[builtins.int, builtins.int]]", - "c": "collections.defaultdict", - "df": "pandas.DataFrame", - "dff": "typing.List[pandas.arrays.PandasArray]", - "lit": "typing.Literal" - }, - "mod_var_occur": { - "x": [ - [ - "v", - "typing", - "List", - "builtins", - "str", - "x" - ] - ], - "l": [ - [ - "l", - "e", - "e" - ] - ], - "c": [], - "df": [], - "dff": [], - "lit": [] - }, - "classes": [ - { - "name": "Delta", - "q_name": "Foo.Delta", - "variables": { - "foo_d": "" - }, - "cls_var_occur": { - "foo_d": [] - }, - "funcs": [] - }, - { - "name": "Foo", - "q_name": "Foo", - "variables": { - "foo_v": "str", - "foo_p": "pathlib.Path" - }, - "cls_var_occur": { - "foo_v": [ - [ - "Foo", - "foo_v" - ], - [ - "Foo", - "foo_v", - "y" - ], - [ - "Foo", - "foo_v" - ] + "tests/examples": { + "src_files": { + "type_apply.py": { + "untyped_seq": "", + "typed_seq": "", + "imports": [ + "pathlib", + "Path" ], - "foo_p": [] - }, - "funcs": [ - { - "name": "foo_inner", - "q_name": "Foo.__init__..foo_inner", - "fn_lc": [ + "variables": { + "x": "builtins.int", + "l": "typing.List[typing.Tuple[builtins.int, builtins.int]]", + "c": "collections.defaultdict", + "df": "pandas.DataFrame", + "dff": "typing.List[pandas.arrays.PandasArray]", + "lit": "typing.Literal" + }, + "mod_var_occur": { + "x": [ [ - 21, - 8 - ], + "v", + "typing", + "List", + "builtins", + "str", + "x" + ] + ], + "l": [ [ - 22, - 16 + "l", + "e", + "e" ] ], - "params": { - "c": "str", - "d": "" - }, - "ret_exprs": [], - "params_occur": { - "c": [], - "d": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "c": "", - "d": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } + "c": [], + "df": [], + "dff": [], + "lit": [] }, - { - "name": "__init__", - "q_name": "Foo.__init__", - "fn_lc": [ + "mod_var_ln": { + "x": [ [ - 20, - 4 + 2, + 0 ], [ - 22, - 16 + 2, + 1 ] ], - "params": {"self": ""}, - "ret_exprs": [], - "params_occur": {}, - "ret_type": "", - "variables": {"i": "int"}, - "fn_var_occur": {}, - "params_descr": {}, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "foo_inner", - "q_name": "Foo.foo_fn..foo_inner", - "fn_lc": [ + "l": [ [ - 24, - 8 + 3, + 0 ], [ - 25, - 16 + 3, 
+ 1 ] ], - "params": { - "a": "", - "b": "", - "c": "", - "d": "", - "args": "", - "kwargs": "" - }, - "ret_exprs": [], - "params_occur": { - "a": [], - "b": [], - "c": [], - "d": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "a": "", - "b": "", - "c": "", - "d": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "foo_fn", - "q_name": "Foo.foo_fn", - "fn_lc": [ + "c": [ [ - 23, - 4 + 4, + 0 ], [ - 27, - 16 + 4, + 1 ] ], - "params": { - "self": "", - "y": "" - }, - "ret_exprs": [ - "return d" - ], - "params_occur": { - "self": [], - "y": [] - }, - "ret_type": "typing.Dict[builtins.str, builtins.bool]", - "variables": { - "d": "typing.Dict[builtins.str, builtins.bool]" - }, - "fn_var_occur": { - "d": [ - [ - "d", - "typing", - "Dict", - "builtins", - "str", - "builtins", - "bool", - "True" - ] - ] - }, - "params_descr": { - "self": "", - "y": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "get_e", - "q_name": "Foo.get_e", - "fn_lc": [ + "df": [ [ - 29, - 4 + 5, + 0 ], [ - 30, - 24 + 5, + 2 ] ], - "params": { - "self": "" - }, - "ret_exprs": [ - "return Foo.foo_v" - ], - "params_occur": { - "self": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "self": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "get_e", - "q_name": "Foo.get_e", - "fn_lc": [ + "dff": [ [ - 32, - 4 + 6, + 0 ], [ - 34, - 24 + 6, + 3 ] ], - "params": { - "self": "", - "y": "builtins.str" + "lit": [ + [ + 7, + 0 + ], + [ + 7, + 3 + ] + ] + }, + "classes": [ + { + "name": "Delta", + "q_name": "Foo.Delta", + "variables": { + "foo_d": "" + }, + "cls_var_occur": { + "foo_d": [] + }, + "cls_var_ln": { + "foo_d": [ + [ + 11, + 8 + ], + [ + 11, + 13 + ] + ] + }, + "funcs": [] }, - "ret_exprs": [ - "return Foo.foo_v" - ], - "params_occur": { - "self": [], - "y": [ - [ - "Foo", - "foo_v", - "y" + { + "name": "Foo", + "q_name": "Foo", + "variables": { + "foo_v": "str", + "foo_p": "pathlib.Path" + }, + "cls_var_occur": { + "foo_v": [ + [ + "Foo", + "foo_v" + ], + [ + "Foo", + "foo_v", + "y" + ], + [ + "Foo", + "foo_v" + ] + ], + "foo_p": [] + }, + "cls_var_ln": { + "foo_v": [ + [ + 29, + 4 + ], + [ + 29, + 9 + ] + ], + "foo_p": [ + [ + 12, + 4 + ], + [ + 12, + 9 + ] ] + }, + "funcs": [ + { + "name": "foo_inner", + "q_name": "Foo.__init__..foo_inner", + "fn_lc": [ + [ + 15, + 8 + ], + [ + 16, + 16 + ] + ], + "params": { + "c": "builtins.str", + "d": "" + }, + "ret_exprs": [], + "params_occur": { + "c": [], + "d": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "c": "", + "d": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "__init__", + "q_name": "Foo.__init__", + "fn_lc": [ + [ + 13, + 4 + ], + [ + 16, + 16 + ] + ], + "params": { + "self": "" + }, + "ret_exprs": [], + "params_occur": { + "self": [] + }, + "ret_type": "", + "variables": { + "i": "builtins.int" + }, + "fn_var_occur": { + "i": [] + }, + "fn_var_ln": { + "i": [ + [ + 14, + 8 + ], + [ + 14, + 14 + ] + ] + }, + "params_descr": { + "self": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "foo_inner", + "q_name": "Foo.foo_fn..foo_inner", + "fn_lc": [ + [ + 18, + 8 + ], + [ + 19, + 16 + ] + ], + "params": { + "a": "", + "b": "", + "c": "", + "d": "", + "args": "", + "kwargs": "" + }, 
+ "ret_exprs": [], + "params_occur": { + "a": [], + "b": [], + "c": [], + "d": [], + "args": [], + "kwargs": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "a": "", + "b": "", + "c": "", + "d": "", + "args": "", + "kwargs": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "foo_fn", + "q_name": "Foo.foo_fn", + "fn_lc": [ + [ + 17, + 4 + ], + [ + 21, + 16 + ] + ], + "params": { + "self": "", + "y": "" + }, + "ret_exprs": [ + "return d" + ], + "params_occur": { + "self": [], + "y": [] + }, + "ret_type": "typing.Dict[builtins.str, builtins.bool]", + "variables": { + "d": "typing.Dict[builtins.str, builtins.bool]" + }, + "fn_var_occur": { + "d": [ + [ + "d", + "typing", + "Dict", + "builtins", + "str", + "builtins", + "bool", + "True" + ] + ] + }, + "fn_var_ln": { + "d": [ + [ + 20, + 8 + ], + [ + 20, + 9 + ] + ] + }, + "params_descr": { + "self": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "get_e", + "q_name": "Foo.get_e", + "fn_lc": [ + [ + 23, + 4 + ], + [ + 24, + 24 + ] + ], + "params": { + "self": "" + }, + "ret_exprs": [ + "return Foo.foo_v" + ], + "params_occur": { + "self": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "self": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "get_e", + "q_name": "Foo.get_e", + "fn_lc": [ + [ + 26, + 4 + ], + [ + 28, + 24 + ] + ], + "params": { + "self": "", + "y": "builtins.str" + }, + "ret_exprs": [ + "return Foo.foo_v" + ], + "params_occur": { + "self": [], + "y": [ + [ + "Foo", + "foo_v", + "y" + ] + ] + }, + "ret_type": "", + "variables": { + "foo_v": "" + }, + "fn_var_occur": { + "foo_v": [ + [ + "Foo", + "foo_v", + "y" + ], + [ + "Foo", + "foo_v" + ] + ] + }, + "fn_var_ln": { + "foo_v": [ + [ + 27, + 8 + ], + [ + 27, + 17 + ] + ] + }, + "params_descr": { + "self": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } ] - }, - "ret_type": "", - "variables": { - "foo_v": "" - }, - "fn_var_occur": { - "foo_v": [ + } + ], + "funcs": [ + { + "name": "Bar", + "q_name": "Bar", + "fn_lc": [ [ - "Foo", - "foo_v", - "y" + 30, + 0 ], [ - "Foo", - "foo_v" + 33, + 12 ] - ] - }, - "params_descr": { - "self": "", - "y": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + ], + "params": { + "x": "typing.List[builtins.str]", + "c": "" + }, + "ret_exprs": [ + "return v" + ], + "params_occur": { + "x": [ + [ + "v", + "typing", + "List", + "builtins", + "str", + "x" + ] + ], + "c": [] + }, + "ret_type": "typing.List[builtins.str]", + "variables": { + "v": "typing.List[builtins.str]", + "l": "" + }, + "fn_var_occur": { + "v": [ + [ + "v", + "typing", + "List", + "builtins", + "str", + "x" + ] + ], + "l": [ + [ + "l", + "e", + "e" + ] + ] + }, + "fn_var_ln": { + "v": [ + [ + 31, + 4 + ], + [ + 31, + 5 + ] + ], + "l": [ + [ + 32, + 4 + ], + [ + 32, + 5 + ] + ] + }, + "params_descr": { + "x": "", + "c": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } } - } - ] - } - ], - "funcs": [ - { - "name": "Bar", - "q_name": "Bar", - "fn_lc": [ - [ - 36, - 0 ], - [ - 39, - 12 - ] - ], - "params": { - "x": "typing.List[builtins.str]", - "c": "" - }, - "ret_exprs": [ - "return v" - ], - "params_occur": { - "x": [ - [ - "v", - "typing", - "List", - "builtins", - "str", - "x" - ] - ] - }, - 
"ret_type": "typing.List[builtins.str]", - "variables": { - "v": "typing.List[builtins.str]", - "l": "" - }, - "fn_var_occur": { - "v": [ - [ - "v", - "typing", - "List", - "builtins", - "str", - "x" - ] + "set": null, + "tc": [ + false, + null ], - "l": [ - [ - "l", - "e", - "e" - ] - ] - }, - "params_descr": { - "x": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + "no_types_annot": { + "U": 15, + "D": 14, + "I": 0 + }, + "type_annot_cove": 0.48 } } - ], - "set": null, - "tc": false, - "no_types_annot": { - "U": 12, - "D": 13, - "I": 0 - }, - "type_annot_cove": 0.52 -} } - } } \ No newline at end of file diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index f40b525..374b4a0 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -42,11 +42,11 @@ "q_name": "Bar.__init__", "fn_lc": [ [ - 12, + 11, 4 ], [ - 14, + 13, 18 ] ], @@ -111,11 +111,11 @@ "q_name": "Bar.delta", "fn_lc": [ [ - 15, + 14, 4 ], [ - 16, + 15, 25 ] ], diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 427a0ab..60c777c 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -59,8 +59,8 @@ class Delta: foo_d = 'Hello, Delta!' foo_p: pathlib.Path = Path('/home/foo/bar') def __init__(self): - self.i: int = 10 - def foo_inner(c: str, d=lambda a,b: a == b): + self.i: builtins.int = 10 + def foo_inner(c: builtins.str, d=lambda a,b: a == b): pass def foo_fn(self, y)-> typing.Dict[builtins.str, builtins.bool]: def foo_inner(a, b, c, d, *args, **kwargs): @@ -131,8 +131,10 @@ def setUpClass(cls): write_file('./tmp_ta/type_apply_typed.py', test_file_typed) # from libsa4py.cst_extractor import Extractor - # # save_json('./tmp_ta/type_apply_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply.py')).to_dict()) - # save_json('./tmp_ta/type_apply_typed_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply_typed.py')).to_dict()) + # save_json('./tmp_ta/type_apply_ex.json', {"tests/examples": {"src_files": {"type_apply.py": + # Extractor.extract(read_file('./tmp_ta/type_apply.py'), include_seq2seq=False).to_dict()}}}) + # save_json('./tmp_ta/type_apply_typed_ex.json', {"tests/examples": {"src_files": {"type_apply_typed.py": + # Extractor.extract(read_file('./tmp_ta/type_apply_typed.py'), include_seq2seq=False).to_dict()}}}) def test_type_apply_pipeline(self): ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) From 57eacf62be2bb5b0e9f06ed8153963bdae14c73d Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 19 Jul 2021 14:25:09 +0200 Subject: [PATCH 13/31] Remove superfluous assignment line from TypeApplier --- libsa4py/cst_transformers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 53e5863..ecb7fb6 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -901,7 +901,6 @@ def __get_fn_param_type(self, param_name: str): def __get_cls(self, cls: cst.ClassDef) -> dict: for c in self.f_processed_dict['classes']: - q = self.__get_qualified_name(cls.name) if c['q_name'] in self.__get_qualified_name(cls.name): return c From f0aa00ead2c3b7d4ea6012f062d0548216c72b98 Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 21 Jul 2021 17:19:59 +0200 Subject: [PATCH 14/31] A workaround for a very rare case where the class' QN doesn't match when applying types --- libsa4py/cst_transformers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libsa4py/cst_transformers.py 
b/libsa4py/cst_transformers.py index ecb7fb6..78df5a8 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -901,7 +901,10 @@ def __get_fn_param_type(self, param_name: str): def __get_cls(self, cls: cst.ClassDef) -> dict: for c in self.f_processed_dict['classes']: - if c['q_name'] in self.__get_qualified_name(cls.name): + q = self.__get_qualified_name(cls.name) + if c['q_name'] == q: + return c + elif c['q_name'].split(".")[-1] == q.split(".")[-1]: return c def __get_fn_vars(self, var_name: str) -> dict: From 19428fd60fdd8ad1c5790cf9a8b2637049939efc Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 21 Jul 2021 17:25:04 +0200 Subject: [PATCH 15/31] When applying types, first match functions' QN & signature; if no match, then check line no. --- libsa4py/cst_transformers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 78df5a8..a98cc56 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -885,9 +885,10 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: fns = self.f_processed_dict['funcs'] for fn in fns: - # if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ - # set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): - if (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): + if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ + set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + return fn + elif (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): return fn def __get_fn_param_type(self, param_name: str): From 65130df582ea211201838422598b26dc4019c845 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 22 Jul 2021 13:45:31 +0200 Subject: [PATCH 16/31] Count total no.
of added types in TypeApplier and its pipeline --- libsa4py/cst_pipeline.py | 7 ++++++- libsa4py/cst_transformers.py | 5 +++++ tests/test_type_apply.py | 4 +++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index e015697..a180ae1 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -267,6 +267,7 @@ def __init__(self, projects_path: str, output_path: str, apply_nlp: bool = True) def process_project(self, proj_json_path: str): proj_json = load_json(proj_json_path) + total_added_types = 0 for p in proj_json.keys(): for i, (f, f_d) in enumerate(proj_json[p]['src_files'].items()): f_read = read_file(join(self.projects_path, f)) @@ -274,8 +275,10 @@ def process_project(self, proj_json_path: str): try: f_parsed = cst.parse_module(f_read) try: - f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(TypeApplier(f_d, self.apply_nlp)) + ta = TypeApplier(f_d, self.apply_nlp) + f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(ta) write_file(join(self.projects_path, f), f_parsed.code) + total_added_types += ta.no_applied_types except KeyError as ke: print(f"A variable not found | project {proj_json_path} | file {f}", ke) traceback.print_exc() @@ -285,6 +288,8 @@ def process_project(self, proj_json_path: str): except cst._exceptions.ParserSyntaxError as pse: print(f"Can't parsed file {f} in project {proj_json_path}", pse) + return total_added_types + def run(self, jobs: int): proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index a98cc56..7d9a963 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -872,6 +872,7 @@ def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.lambda_d = 0 self.all_applied_types = set() + self.no_applied_types = 0 if apply_nlp: self.nlp_p = NLPreprocessor().process_identifier @@ -898,6 +899,7 @@ def __get_fn_param_type(self, param_name: str): fn_param_type = self.__name2annotation(fn_param_type_resolved) if fn_param_type is not None: self.all_applied_types.add((fn_param_type_resolved, fn_param_type)) + self.no_applied_types += 1 return fn_param_type def __get_cls(self, cls: cst.ClassDef) -> dict: @@ -1000,6 +1002,7 @@ def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.Fu fn_ret_type = self.__name2annotation(fn_ret_type_resolved) if fn_ret_type is not None: self.all_applied_types.add((fn_ret_type_resolved, fn_ret_type)) + self.no_applied_types += 1 return updated_node.with_changes(returns=fn_ret_type) else: return updated_node.with_changes(returns=None) @@ -1032,6 +1035,7 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, t_annot_node = self.__name2annotation(t_annot_node_resolved) if t_annot_node is not None: self.all_applied_types.add((t_annot_node_resolved, t_annot_node)) + self.no_applied_types += 1 return updated_node.with_changes(body=[cst.AnnAssign( target=original_node.body[0].targets[0].target, value=original_node.body[0].value, @@ -1052,6 +1056,7 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, t_annot_node = self.__name2annotation(t_annot_node_resolved) if t_annot_node is not None: self.all_applied_types.add((t_annot_node_resolved, t_annot_node)) + self.no_applied_types += 1 return updated_node.with_changes(body=[cst.AnnAssign( target=original_node.body[0].target, 
value=original_node.body[0].value, diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 60c777c..704066e 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -138,7 +138,7 @@ def setUpClass(cls): def test_type_apply_pipeline(self): ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) - ta.process_project('./examples/type_apply_ex.json') + total_no_added_types = ta.process_project('./examples/type_apply_ex.json') exp_split = test_file_exp.splitlines() out_split = read_file('./tmp_ta/type_apply.py').splitlines() @@ -147,6 +147,8 @@ def test_type_apply_pipeline(self): out = """{}""".format("\n".join(out_split[7:])) self.assertEqual(exp, out) + self.assertEqual(total_no_added_types, 16) + # The imported types from typing self.assertEqual(Counter(" ".join(exp_split[0:7])), Counter(" ".join(out_split[0:7]))) From 61667719fadfa9143bd681d6a1aedcf44d9197a3 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 22 Jul 2021 14:51:53 +0200 Subject: [PATCH 17/31] Improvements to the TypeAnnotatingProjects pipeline --- libsa4py/cst_pipeline.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index a180ae1..1c24da5 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -270,8 +270,9 @@ def process_project(self, proj_json_path: str): total_added_types = 0 for p in proj_json.keys(): for i, (f, f_d) in enumerate(proj_json[p]['src_files'].items()): - f_read = read_file(join(self.projects_path, f)) - if len(f_read) != 0: + print(f"Adding types to file {f} from project {proj_json_path}") + if f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] > 0: + f_read = read_file(join(self.projects_path, f)) try: f_parsed = cst.parse_module(f_read) try: @@ -279,6 +280,7 @@ def process_project(self, proj_json_path: str): f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(ta) write_file(join(self.projects_path, f), f_parsed.code) total_added_types += ta.no_applied_types + print(f"Applied {ta.no_applied_types} types to file {f} from project {proj_json_path}") except KeyError as ke: print(f"A variable not found | project {proj_json_path} | file {f}", ke) traceback.print_exc() @@ -293,7 +295,11 @@ def process_project(self, proj_json_path: str): def run(self, jobs: int): proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True) - ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) for p_j in proj_jsons) + start_t = time.time() + proj_type_added = ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) \ + for p_j in proj_jsons) + print(f"Finished applying types in {str(timedelta(seconds=time.time() - start_t))}") + print(f"{sum(proj_type_added)} types applied to the whole dataset") class TypeAnnotationsRemoval: From 95d3c7e8be30ed57831cb78c01595ca558c1b25c Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 23 Jul 2021 13:32:33 +0200 Subject: [PATCH 18/31] Fix test failure for types removal --- libsa4py/cst_transformers.py | 1 + tests/examples/type_apply_typed_ex.json | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 7d9a963..ea537eb 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -853,6 +853,7 @@ def leave_SubscriptElement(self, original_node, updated_node): return updated_node +# TODO: Write two separate 
CSTTransformers for applying and removing type annotations class TypeApplier(cst.CSTTransformer): """ It applies (inferred) type annotations to a source code file. diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index 374b4a0..388d001 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -214,7 +214,7 @@ "tc": false, "no_types_annot": { "U": 0, - "D": 0, + "D": 1, "I": 0 }, "type_annot_cove": 0.0 From 7927d45edf6960d55ff48643ee7760ed9983e26c Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 26 Jul 2021 10:38:09 +0200 Subject: [PATCH 19/31] Improvements to TypeApplier: (1) Better matching of functions, classes and variables (2) counting failed applied types --- libsa4py/cst_transformers.py | 47 ++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index ea537eb..20261db 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -874,6 +874,7 @@ def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.all_applied_types = set() self.no_applied_types = 0 + self.no_failed_applied_types = 0 if apply_nlp: self.nlp_p = NLPreprocessor().process_identifier @@ -886,11 +887,15 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: else: fns = self.f_processed_dict['funcs'] + qn = self.__get_qualified_name(f_node.name) + fn_params = set(self.__get_fn_params(f_node.params)) + fn_lc = self.__get_line_column_no(f_node) for fn in fns: - if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ - set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + if (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == fn_lc: return fn - elif (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): + + for fn in fns: + if fn['q_name'] == qn and set(list(fn['params'].keys())) == fn_params: return fn def __get_fn_param_type(self, param_name: str): @@ -902,13 +907,18 @@ def __get_fn_param_type(self, param_name: str): self.all_applied_types.add((fn_param_type_resolved, fn_param_type)) self.no_applied_types += 1 return fn_param_type + else: + self.no_failed_applied_types += 1 def __get_cls(self, cls: cst.ClassDef) -> dict: + cls_lc = self.__get_line_column_no(cls) + cls_qn = self.__get_qualified_name(cls.name) for c in self.f_processed_dict['classes']: - q = self.__get_qualified_name(cls.name) - if c['q_name'] == q: + if (c['cls_lc'][0][0], c['cls_lc'][1][0]) == cls_lc: return c - elif c['q_name'].split(".")[-1] == q.split(".")[-1]: + + for c in self.f_processed_dict['classes']: + if c['q_name'] == cls_qn: return c def __get_fn_vars(self, var_name: str) -> dict: @@ -932,24 +942,27 @@ def __get_cls_vars(self, var_name: str) -> dict: def __get_mod_vars(self): return self.f_processed_dict['variables'] - def __get_var_type_assign_t(self, var_name: str): + def __get_var_type_assign_t(self, var_name: str, var_node): t: str = None + var_line_no = self.__get_line_column_no(var_node) if len(self.cls_visited) != 0: if len(self.fn_visited) != 0: # A class method's variable - if self.fn_visited[-1][1][var_name] == self.last_visited_assign_t_count: + if self.fn_visited[-1][0]['fn_var_ln'][var_name][0][0] == var_line_no[0]: t = self.__get_fn_vars(self.nlp_p(var_name)) else: # A class variable - if self.cls_visited[-1][1][var_name] == self.last_visited_assign_t_count: + if self.cls_visited[-1][0]["cls_var_ln"][var_name][0][0] == var_line_no[0]: t =
self.__get_cls_vars(self.nlp_p(var_name)) elif len(self.fn_visited) != 0: # A module function's variable - if self.fn_visited[-1][1][var_name] == self.last_visited_assign_t_count: + #if self.fn_visited[-1][1][var_name] == self.last_visited_assign_t_count: + if self.fn_visited[-1][0]['fn_var_ln'][var_name][0][0] == var_line_no[0]: t = self.__get_fn_vars(self.nlp_p(var_name)) else: # A module's variables - t = self.__get_mod_vars()[self.nlp_p(var_name)] + if self.f_processed_dict['mod_var_ln'][var_name][0][0] == var_line_no[0]: + t = self.__get_mod_vars()[self.nlp_p(var_name)] return t def __get_var_type_an_assign(self, var_name: str): @@ -1005,6 +1018,8 @@ def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.Fu self.all_applied_types.add((fn_ret_type_resolved, fn_ret_type)) self.no_applied_types += 1 return updated_node.with_changes(returns=fn_ret_type) + else: + self.no_failed_applied_types += 1 else: return updated_node.with_changes(returns=None) @@ -1026,10 +1041,12 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, target=match.DoNotCare())])])): if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( target=match.Name(value=match.DoNotCare()))])])): - t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value) + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value, + original_node.body[0].targets[0].target) elif match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( target=match.Attribute(value=match.Name(value=match.DoNotCare()), attr=match.Name(value=match.DoNotCare())))])])): - t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.attr.value) + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.attr.value, + original_node.body[0].targets[0].target) if t is not None: t_annot_node_resolved = self.resolve_type_alias(t) @@ -1044,6 +1061,8 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, equal=cst.AssignEqual(whitespace_after=original_node.body[0].targets[0].whitespace_after_equal, whitespace_before=original_node.body[0].targets[0].whitespace_before_equal))] ) + else: + self.no_failed_applied_types += 1 # Typed variables elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare(), value=match.MatchIfTrue(lambda v: v is not None))])): @@ -1063,6 +1082,8 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, value=original_node.body[0].value, annotation=t_annot_node, equal=original_node.body[0].equal)]) + else: + self.no_failed_applied_types += 1 else: return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, whitespace_before_equal=original_node.body[0].equal.whitespace_before, From d41fea3615b8b24ba22429e05254d5ccfb48733e Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 26 Jul 2021 10:40:14 +0200 Subject: [PATCH 20/31] Improvements to the pipeline of TypeApplier: (1) Dry run (2) Assertion for no.
of applied types --- libsa4py/__main__.py | 6 +++++- libsa4py/cst_pipeline.py | 22 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 16a0b31..6e4e1de 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -12,7 +12,7 @@ def process_projects(args): def apply_types_projects(args): - tap = TypeAnnotatingProjects(args.p, args.o) + tap = TypeAnnotatingProjects(args.p, args.o, args.dry_run) tap.run(args.j) @@ -56,6 +56,10 @@ def main(): apply_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") apply_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") apply_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing projects") + apply_parser.add_argument("--d", dest='dry_run', action='store_true', + help="Dry run does not apply types to the dataset's files") + + apply_parser.set_defaults(dry_run=False) apply_parser.set_defaults(func=apply_types_projects) remove_parser = sub_parsers.add_parser('remove') diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 1c24da5..cc7963e 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -1,3 +1,4 @@ +from libsa4py.cst_visitor import TypeAnnotationCounter import os import traceback import random @@ -260,17 +261,21 @@ class TypeAnnotatingProjects: It applies the inferred type annotations to the input dataset """ - def __init__(self, projects_path: str, output_path: str, apply_nlp: bool = True): + def __init__(self, projects_path: str, output_path: str, dry_run: bool = False, + apply_nlp: bool = True): self.projects_path = projects_path self.output_path = output_path + self.dry_run = dry_run self.apply_nlp = apply_nlp def process_project(self, proj_json_path: str): proj_json = load_json(proj_json_path) total_added_types = 0 + total_no_types = 0 for p in proj_json.keys(): for i, (f, f_d) in enumerate(proj_json[p]['src_files'].items()): print(f"Adding types to file {f} from project {proj_json_path}") + total_no_types += f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] if f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] > 0: f_read = read_file(join(self.projects_path, f)) try: @@ -278,19 +283,23 @@ def process_project(self, proj_json_path: str): try: ta = TypeApplier(f_d, self.apply_nlp) f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(ta) - write_file(join(self.projects_path, f), f_parsed.code) + if not self.dry_run: + write_file(join(self.projects_path, f), f_parsed.code) total_added_types += ta.no_applied_types print(f"Applied {ta.no_applied_types} types to file {f} from project {proj_json_path}") + assert f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] <= self.__get_no_applied_types(f_parsed.code) + ta.no_failed_applied_types except KeyError as ke: print(f"A variable not found | project {proj_json_path} | file {f}", ke) traceback.print_exc() except TypeError as te: print(f"Project {proj_json_path} | file {f}", te) traceback.print_exc() + except AssertionError as te: + print(f"[AssertionError] Project {proj_json_path} | file {f}", te) except cst._exceptions.ParserSyntaxError as pse: print(f"Can't parsed file {f} in project {proj_json_path}", pse) - return total_added_types + return total_added_types, total_no_types def run(self, jobs: int): proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') @@ -299,8 +308,13 @@ def run(self, jobs: int): proj_type_added = 
ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) \ for p_j in proj_jsons) print(f"Finished applying types in {str(timedelta(seconds=time.time() - start_t))}") - print(f"{sum(proj_type_added)} types applied to the whole dataset") + print(f"{sum([a for a, t in proj_type_added]):,}/{sum([t for a, t in proj_type_added]):,} types applied to the whole dataset") + def __get_no_applied_types(self, code: str) -> int: + f_applied_p = cst.parse_module(code) + tac = TypeAnnotationCounter() + f_applied_p.visit(tac) + return tac.total_no_type_annot class TypeAnnotationsRemoval: """ From 3111e8445540843d0b3ebebf691f152c2f565a7e Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 26 Jul 2021 11:12:24 +0200 Subject: [PATCH 21/31] Fix test failure for the TypeApplier --- tests/examples/type_apply_ex.json | 20 + tests/examples/type_apply_typed_ex.json | 505 ++++++++++++++---------- tests/test_type_apply.py | 8 +- 3 files changed, 327 insertions(+), 206 deletions(-) diff --git a/tests/examples/type_apply_ex.json b/tests/examples/type_apply_ex.json index 5976215..817ffe7 100644 --- a/tests/examples/type_apply_ex.json +++ b/tests/examples/type_apply_ex.json @@ -105,6 +105,16 @@ { "name": "Delta", "q_name": "Foo.Delta", + "cls_lc": [ + [ + 10, + 4 + ], + [ + 11, + 31 + ] + ], "variables": { "foo_d": "" }, @@ -128,6 +138,16 @@ { "name": "Foo", "q_name": "Foo", + "cls_lc": [ + [ + 8, + 0 + ], + [ + 29, + 16 + ] + ], "variables": { "foo_v": "str", "foo_p": "pathlib.Path" diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index 388d001..4ce162c 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -1,224 +1,325 @@ { - "tests/examples": { - "src_files": { - "type_apply_typed.py": { - "untyped_seq": "a = [number] [EOL] l = [ [number] , [number] , [number] ] [EOL] c = [number] [EOL] [EOL] def foo ( x , y ) : [EOL] z = x + y [EOL] return z [EOL] [EOL] class Bar : [EOL] bar_var1 = [string] [EOL] bar_var2 = [number] [EOL] def __init__ ( a , b ) : [EOL] self . a = a [EOL] self .
b = b [EOL] def delta ( n ) : [EOL] return [ [number] ] * p [EOL]", - "typed_seq": "$builtins.int$ 0 0 0 $List[int]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 $builtins.float$ 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 $builtins.int$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 $List[float]$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0", - "imports": [], - "variables": { - "a": "", - "l": "", - "c": "", - "h": "" - }, - "mod_var_occur": { - "a": [ - [ - "self", - "a", - "builtins", - "int", - "a" - ] - ], - "l": [], - "c": [] - }, - "classes": [ - { - "name": "Bar", - "q_name": "Bar", - "variables": { - "bar_var1": "", - "bar_var2": "" - }, - "cls_var_occur": { - "bar_var1": [], - "bar_var2": [] - }, - "funcs": [ - { - "name": "__init__", - "q_name": "Bar.__init__", - "fn_lc": [ + "tests/examples": { + "src_files": { + "type_apply_typed.py": { + "untyped_seq": "", + "typed_seq": "", + "imports": [], + "variables": { + "a": "", + "l": "", + "c": "", + "h": "builtins.dict" + }, + "mod_var_occur": { + "a": [ + [ + "self", + "a", + "a" + ] + ], + "l": [], + "c": [], + "h": [] + }, + "mod_var_ln": { + "a": [ [ - 11, - 4 + 1, + 0 ], [ - 13, - 18 + 1, + 1 ] ], - "params": { - "a": "", - "b": "" - }, - "ret_exprs": [], - "params_occur": { - "a": [ - [ - "self", - "a", - "builtins", - "int", - "a" - ] + "l": [ + [ + 2, + 0 ], - "b": [ - [ - "self", - "b", - "b" - ] + [ + 2, + 1 ] - }, - "ret_type": "", - "variables": { - "a": "", - "b": "" - }, - "fn_var_occur": { - "a": [ + ], + "c": [ + [ + 3, + 0 + ], + [ + 3, + 1 + ] + ], + "h": [ + [ + 4, + 0 + ], + [ + 4, + 1 + ] + ] + }, + "classes": [ + { + "name": "Bar", + "q_name": "Bar", + "cls_lc": [ + [ + 8, + 0 + ], [ - "self", - "a", - "builtins", - "int", - "a" + 15, + 25 ] ], - "b": [ - [ - "self", - "b", - "b" + "variables": { + "bar_var1": "", + "bar_var2": "" + }, + "cls_var_occur": { + "bar_var1": [], + "bar_var2": [] + }, + "cls_var_ln": { + "bar_var1": [ + [ + 9, + 4 + ], + [ + 9, + 12 + ] + ], + "bar_var2": [ + [ + 10, + 4 + ], + [ + 10, + 12 + ] ] + }, + "funcs": [ + { + "name": "__init__", + "q_name": "Bar.__init__", + "fn_lc": [ + [ + 11, + 4 + ], + [ + 13, + 18 + ] + ], + "params": { + "a": "", + "b": "" + }, + "ret_exprs": [], + "params_occur": { + "a": [ + [ + "self", + "a", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "ret_type": "", + "variables": { + "a": "", + "b": "" + }, + "fn_var_occur": { + "a": [ + [ + "self", + "a", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "fn_var_ln": { + "a": [ + [ + 12, + 8 + ], + [ + 12, + 14 + ] + ], + "b": [ + [ + 13, + 8 + ], + [ + 13, + 14 + ] + ] + }, + "params_descr": { + "a": "", + "b": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "delta", + "q_name": "Bar.delta", + "fn_lc": [ + [ + 14, + 4 + ], + [ + 15, + 25 + ] + ], + "params": { + "n": "" + }, + "ret_exprs": [ + "return [2.17] * p" + ], + "params_occur": { + "n": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "n": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } ] - }, - "params_descr": { - "a": "", - "b": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null } - }, - { - "name": "delta", - "q_name": "Bar.delta", - "fn_lc": [ - [ - 14, - 4 + ], + "funcs": [ + { + "name": "foo", + "q_name": "foo", + "fn_lc": [ + [ + 5, + 0 + ], + [ + 7, + 12 + ] ], - [ - 15, - 25 - ] - ], - "params": { 
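As an aside on the new fixture fields: each *_ln entry records a source span as [[start_line, start_col], [end_line, end_col]]. Judging by these examples, lines are 1-based and columns 0-based, so a module variable `a` assigned on the file's first line comes out as:

    mod_var_ln = {"a": [[1, 0], [1, 1]]}  # the name `a` occupies line 1, columns 0..1
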
- "n": "" - }, - "ret_exprs": [ - "return [2.17] * p" - ], - "params_occur": { - "n": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "n": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + "params": { + "x": "", + "y": "" + }, + "ret_exprs": [ + "return z" + ], + "params_occur": { + "x": [ + [ + "z", + "x", + "y" + ] + ], + "y": [ + [ + "z", + "x", + "y" + ] + ] + }, + "ret_type": "", + "variables": { + "z": "" + }, + "fn_var_occur": { + "z": [ + [ + "z", + "x", + "y" + ] + ] + }, + "fn_var_ln": { + "z": [ + [ + 6, + 4 + ], + [ + 6, + 5 + ] + ] + }, + "params_descr": { + "x": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } } - } - ] - } - ], - "funcs": [ - { - "name": "foo", - "q_name": "foo", - "fn_lc": [ - [ - 5, - 0 ], - [ - 7, - 12 - ] - ], - "params": { - "x": "", - "y": "" - }, - "ret_exprs": [ - "return z" - ], - "params_occur": { - "x": [ - [ - "z", - "builtins", - "int", - "x", - "y" - ] + "set": null, + "tc": [ + false, + null ], - "y": [ - [ - "z", - "builtins", - "int", - "x", - "y" - ] - ] - }, - "ret_type": "", - "variables": { - "z": "" - }, - "fn_var_occur": { - "z": [ - [ - "z", - "builtins", - "int", - "x", - "y" - ] - ] - }, - "params_descr": { - "x": "", - "y": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + "no_types_annot": { + "U": 14, + "D": 1, + "I": 0 + }, + "type_annot_cove": 0.07 } } - ], - "set": null, - "tc": false, - "no_types_annot": { - "U": 0, - "D": 1, - "I": 0 - }, - "type_annot_cove": 0.0 -} } - } } \ No newline at end of file diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 704066e..6b23b63 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -12,7 +12,7 @@ dff = pd.DataFrame([1,2]) lit = "Hello!" class Foo: - foo_v: str = 'Hello, Foo!' + foo_v = 'Hello, Foo!' class Delta: foo_d = 'Hello, Delta!' foo_p = Path('/home/foo/bar') @@ -54,7 +54,7 @@ def Bar(x=['apple', 'orange'], *, c): dff: typing.List[pandas.arrays.PandasArray] = pd.DataFrame([1,2]) lit: typing.Literal = "Hello!" class Foo: - foo_v: str = 'Hello, Foo!' + foo_v = 'Hello, Foo!' class Delta: foo_d = 'Hello, Delta!' 
foo_p: pathlib.Path = Path('/home/foo/bar') @@ -74,7 +74,7 @@ def get_e(self): def get_e(self, y: builtins.str): Foo.foo_v = y return Foo.foo_v - foo_v = "No" + foo_v: str = "No" def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[builtins.str]: v: typing.List[builtins.str] = x l = lambda e: e+1 @@ -147,7 +147,7 @@ def test_type_apply_pipeline(self): out = """{}""".format("\n".join(out_split[7:])) self.assertEqual(exp, out) - self.assertEqual(total_no_added_types, 16) + self.assertEqual(total_no_added_types[0], 16) # The imported types from typing self.assertEqual(Counter(" ".join(exp_split[0:7])), Counter(" ".join(out_split[0:7]))) From 8efb7b1d41f41054884f1a478e29a4562c2a322e Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 28 Jul 2021 11:05:58 +0200 Subject: [PATCH 22/31] Improvements to the TypeRemoval pipeline : (1) Dry run (2) better multi-processing (3) Max try --- libsa4py/__main__.py | 6 +- libsa4py/cst_pipeline.py | 139 +++++++++++++++++++++++++++------------ 2 files changed, 102 insertions(+), 43 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 6e4e1de..8dc283e 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -17,7 +17,7 @@ def apply_types_projects(args): def remove_err_type_annotations(args): - tar = TypeAnnotationsRemoval(args.p, args.o, "") + tar = TypeAnnotationsRemoval(args.p, args.o, "", args.l, args.dry_run) tar.run(args.j) @@ -66,6 +66,10 @@ def main(): remove_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") remove_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") remove_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing files") + remove_parser.add_argument("--l", required=False, type=int, help="Number of projects to process") + remove_parser.add_argument("--d", dest='dry_run', action='store_true', + help="Dry run does not remove types from the dataset's files") + remove_parser.set_defaults(dry_run=False) remove_parser.set_defaults(func=remove_err_type_annotations) args = arg_parser.parse_args() diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index cc7963e..ffa870d 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -4,6 +4,7 @@ import random import csv import time +import queue from typing import List, Dict, Tuple from os.path import join @@ -11,7 +12,8 @@ from pathlib import Path from datetime import timedelta from joblib import delayed -from multiprocessing import Manager +from multiprocessing import Manager, Process, Queue, managers +from multiprocessing.queues import Queue from dpu_utils.utils.dataloading import load_jsonl_gz from libsa4py.cst_extractor import Extractor from libsa4py.cst_transformers import TypeApplier @@ -321,13 +323,17 @@ class TypeAnnotationsRemoval: Removes type annotations that cannot be type-checked by mypy """ - def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, apply_nlp: bool = True): + def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, no_projects_limit: int = None, + dry_run: bool = False, apply_nlp: bool = True): self.projects_path = projects_path self.processed_projects_path = processed_projects_path self.output_path = output_path + self.no_projects_limit = no_projects_limit + self.dry_run = dry_run self.apply_nlp = apply_nlp - def process_file(self, f: str, f_d_repr: dict, tc_res: dict): + #def process_file(self, f: str, 
f_d_repr: dict, tc_res: dict): + def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. # init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), # MypyManager('mypy', MAX_TC_TIME)) @@ -336,49 +342,97 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): # return # else: # Only files with type annotations - if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: + while not is_f_loader_done.value or q.qsize() != 0: try: - tmp_f = create_tmp_file(".py") - f_read = read_file(join(self.projects_path, f)) - f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], - tmp_f) - print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ - total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") - tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, - "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} - # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - write_file(join(self.projects_path, f), f_tc_code) - except Exception as e: - print(f"f: {f} | e: {e}") - traceback.print_exc() - finally: - delete_tmp_file(tmp_f) + f, f_d_repr = q.get(True, 1) + if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: + try: + tmp_f = create_tmp_file(".py") + f_read = read_file(join(self.projects_path, f)) + f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], + tmp_f) + print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ + total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") + tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} + # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) + if not self.dry_run and tc_errs == 0: + write_file(join(self.projects_path, f), f_tc_code) + except Exception as e: + print(f"f: {f} | e: {e}") + traceback.print_exc() + finally: + delete_tmp_file(tmp_f) + except queue.Empty as e: + print(f"Worker {os.getpid()} finished! 
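The worker loop above is a standard producer/consumer layout built on a Manager-backed queue. Stripped of the type-checking details it looks roughly like this; the names are illustrative, not libsa4py API:

    import queue
    from multiprocessing import Manager, Process

    def worker(q, producer_done):
        # Drain the queue until the producer is done AND the queue is empty;
        # the 1-second timeout lets the worker re-check the exit condition.
        while not producer_done.value or q.qsize() != 0:
            try:
                item = q.get(True, 1)
            except queue.Empty:
                continue
            print(f"processing {item}")

    if __name__ == "__main__":
        m = Manager()
        q, done = m.Queue(), m.Value('i', False)
        workers = [Process(target=worker, args=(q, done)) for _ in range(4)]
        for w in workers:
            w.start()
        for i in range(10):
            q.put(i)
        done.value = True
        for w in workers:
            w.join()
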
Queue's empty!") + print(f"File loader working {is_f_loader_done.value} and queue size {q.qsize()}") def run(self, jobs: int): - merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) - not_tced_src_f: List[Tuple[str, dict]] = [] - for p, p_v in list(merged_projects['projects'].items()): - for f, f_v in p_v['src_files'].items(): - if not f_v['tc'][0] and f_v['tc'] != [False, None]: - not_tced_src_f.append((f, f_v)) - - del merged_projects - # not_tced_src_f = not_tced_src_f[:250] - # print("L:", len(not_tced_src_f)) manager = Manager() + q = manager.Queue() + is_f_loader_done = manager.Value('i', False) + + file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done)) + file_loader.start() + #file_loader.join() + + print("File loader started!") + + # merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + # not_tced_src_f: List[Tuple[str, dict]] = [] + # for p, p_v in list(merged_projects['projects'].items()): + # for f, f_v in p_v['src_files'].items(): + # if not f_v['tc'][0] and f_v['tc'] != [False, None]: + # not_tced_src_f.append((f, f_v)) + + # del merged_projects + # # not_tced_src_f = not_tced_src_f[:250] + # # print("L:", len(not_tced_src_f)) + # manager = Manager() + time.sleep(5) + start_t = time.time() tc_res = manager.dict() - ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ - for f, f_d in not_tced_src_f) - + file_processors = [] + for j in range(jobs): + p = Process(target=self.process_file, args=(q, is_f_loader_done, tc_res)) + p.daemon = True + file_processors.append(p) + p.start() + + + for p in file_processors: + p.join() + file_loader.join() + # ParallelExecutor(n_jobs=jobs)(total=0)(delayed(self.process_file)(f, f_d, tc_res) \ + # for f, f_d in not_tced_src_f) + print(f"Finished fixing invalid types in {str(timedelta(seconds=time.time() - start_t))}") save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) - + + def __load_projects_files(self, q: Queue, is_done): + proj_jsons = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') + proj_jsons = proj_jsons[:self.no_projects_limit] if self.no_projects_limit is not None else proj_jsons + f_loaded = 0 + for p_j in proj_jsons: + proj_json = load_json(p_j) + for _, p_v in proj_json.items(): + for f, f_v in p_v['src_files'].items(): + if not f_v['tc'][0] and f_v['tc'] != [False, None] and f_v['tc'][1] <= 100: + q.put((f, f_v)) + f_loaded += 1 + #print("Adding files to Queue...") + is_done.value = True + print(f"Loaded {f_loaded} Python files") + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: type_annots_removed: List[str] = [] + no_try = 0 + MAX_TRY = 10 def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + nonlocal no_try if no_tc_err is not None: if tc: type_annots_removed.append(org_gt) @@ -386,8 +440,9 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) - elif no_tc_err == curr_no_tc_err: + else: org_gt_d = org_gt + no_try += 1 return tc, no_tc_err, f_code @@ -408,7 +463,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['variables'][m_v] = m_v_t tc, 
no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, m_v_t, f_d_repr['variables'][m_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed for i, fn in enumerate(f_d_repr['funcs']): @@ -428,7 +483,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['funcs'][i]['params'][p_n] = p_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['funcs'][i]['params'][p_n]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed for fn_v, fn_v_t in fn['variables'].items(): @@ -447,7 +502,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, f_d_repr['funcs'][i]['variables'][fn_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The return type for module-level functions @@ -467,7 +522,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['funcs'][i]['ret_type'] = org_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['funcs'][i]['ret_type']) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The type of class-level vars @@ -488,7 +543,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, c_v_t, f_d_repr['classes'][c_i]['variables'][c_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The type of arguments for class-level functions @@ -509,7 +564,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The type of local variables for class-level functions @@ -529,7 +584,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The return type for class-level functions @@ -550,7 +605,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed return out_f_code, init_no_tc_err, type_annots_removed From 1fc8156571cbfa5a4580a0485231e441dd14972c Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 28 Jul 2021 11:08:58 +0200 Subject: [PATCH 23/31] Run mypy with the file's abs. 
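Abstracting away the per-slot repetition in PATCH 22, the removal strategy is a greedy loop with a bounded retry budget: blank one annotation at a time, re-run mypy, keep the removal if the error count drops, restore it otherwise, and give up after MAX_TRY failed attempts. A condensed sketch, where render() and check() are hypothetical stand-ins for applying the annotations and counting mypy errors:

    MAX_TRY = 10

    def remove_unchecked(annots: dict, render, check) -> dict:
        # annots maps a slot (e.g. a parameter) to its annotation string;
        # render() produces source with the current annotations applied and
        # check() returns mypy's error count for that source.
        curr_err = check(render(annots))
        no_try = 0
        for slot, ann in list(annots.items()):
            if no_try > MAX_TRY:
                break
            if ann == "":
                continue
            annots[slot] = ""                # tentatively drop this annotation
            new_err = check(render(annots))
            if new_err == 0:
                break                        # the file now type-checks
            if new_err < curr_err:
                curr_err = new_err           # the removal helped; keep it
            else:
                annots[slot] = ann           # no improvement; restore it
                no_try += 1
        return annots
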
path, which may improve TC in some cases --- libsa4py/type_check.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index abcfe28..40a9f19 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -68,10 +68,12 @@ def _build_tc_cmd(self, fpath): def _type_check(self, fpath): try: - cwd = os.getcwd() - os.chdir(dirname(fpath)) + # cwd = os.getcwd() + # os.chdir(dirname(fpath)) + # Runs mypy with the file's absolute path + # It may improve detection of type errors in some cases! result = subprocess.run( - self._build_tc_cmd(basename(fpath)), + self._build_tc_cmd(fpath), # basename(fpath) capture_output=True, text=True, timeout=self._timeout, @@ -81,8 +83,8 @@ def _type_check(self, fpath): return retcode, outlines except subprocess.TimeoutExpired: raise TypeCheckingTooLong - finally: - os.chdir(cwd) + # finally: + # os.chdir(cwd) @abstractmethod def _check_tc_outcome(self, returncode, outlines): From a89338c04084da339ae67d18703347b28d8cc3f7 Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 2 Aug 2021 14:29:34 +0200 Subject: [PATCH 24/31] Fixing re-importing names when applying types --- libsa4py/cst_transformers.py | 17 ++++- tests/examples/type_apply_ex.json | 105 +++++++++++++++--------------- tests/test_type_apply.py | 5 +- 3 files changed, 70 insertions(+), 57 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 20261db..af078a7 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -876,6 +876,8 @@ def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.no_applied_types = 0 self.no_failed_applied_types = 0 + self.imported_names: List[str] = [] + if apply_nlp: self.nlp_p = NLPreprocessor().process_identifier else: @@ -1116,20 +1118,29 @@ def visit_AssignTarget(self, node: cst.AssignTarget): def leave_Module(self, original_node: cst.Module, updated_node: cst.Module): return updated_node.with_changes(body=self.__get_required_imports() + list(updated_node.body)) + def visit_ImportAlias(self, node: cst.ImportAlias): + self.imported_names.extend([n.value for n in match.findall(node.name, match.Name(value=match.DoNotCare()))]) + # TODO: Check the imported modules before adding new ones def __get_required_imports(self): - def find_required_modules(all_types): + def find_required_modules(all_types, imported_names): req_mod = set() for _, a_node in all_types: m = match.findall(a_node.annotation, match.Attribute(value=match.DoNotCare(), attr=match.DoNotCare())) if len(m) != 0: for i in m: - req_mod.add([n.value for n in match.findall(i, match.Name(value=match.DoNotCare()))][0]) + mod_imp = [n.value for n in match.findall(i, match.Name(value=match.DoNotCare()))][0] + if mod_imp not in imported_names: + req_mod.add(mod_imp) + # if n.value not in imported_names + print(req_mod) return req_mod req_imports = [] - all_req_mods = find_required_modules(self.all_applied_types) + self.imported_names = set(self.imported_names) + all_req_mods = find_required_modules(self.all_applied_types, self.imported_names) all_type_names = set(chain.from_iterable(map(lambda t: regex.findall(r"\w+", t[0]), self.all_applied_types))) + all_type_names = all_type_names - self.imported_names typing_imports = PY_TYPING_MOD & all_type_names collection_imports = PY_COLLECTION_MOD & all_type_names diff --git a/tests/examples/type_apply_ex.json index 817ffe7..608d44e 100644 --- a/tests/examples/type_apply_ex.json +++ 
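Pulled out of the class for illustration, the _type_check change in PATCH 23 boils down to the call below, using the flags _build_tc_cmd assembles at this point in the series; the 120-second timeout is only a stand-in for MAX_TC_TIME, whose value is configured elsewhere:

    import subprocess

    def run_mypy(fpath: str, timeout: int = 120):
        # Running mypy on the absolute path replaces the old pattern of
        # chdir-ing into the file's directory and passing the basename.
        result = subprocess.run(
            ["mypy", "--show-error-codes", "--no-incremental",
             "--cache-dir=/dev/null", fpath],
            capture_output=True, text=True, timeout=timeout,
        )
        return result.returncode, result.stdout.splitlines()
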
b/tests/examples/type_apply_ex.json @@ -6,7 +6,8 @@ "typed_seq": "", "imports": [ "pathlib", - "Path" + "Path", + "pandas" ], "variables": { "x": "builtins.int", @@ -42,61 +43,61 @@ "mod_var_ln": { "x": [ [ - 2, + 3, 0 ], [ - 2, + 3, 1 ] ], "l": [ [ - 3, + 4, 0 ], [ - 3, + 4, 1 ] ], "c": [ [ - 4, + 5, 0 ], [ - 4, + 5, 1 ] ], "df": [ [ - 5, + 6, 0 ], [ - 5, + 6, 2 ] ], "dff": [ [ - 6, + 7, 0 ], [ - 6, + 7, 3 ] ], "lit": [ [ - 7, + 8, 0 ], [ - 7, + 8, 3 ] ] @@ -107,11 +108,11 @@ "q_name": "Foo.Delta", "cls_lc": [ [ - 10, + 11, 4 ], [ - 11, + 12, 31 ] ], @@ -124,11 +125,11 @@ "cls_var_ln": { "foo_d": [ [ - 11, + 12, 8 ], [ - 11, + 12, 13 ] ] @@ -140,16 +141,16 @@ "q_name": "Foo", "cls_lc": [ [ - 8, + 9, 0 ], [ - 29, - 16 + 30, + 30 ] ], "variables": { - "foo_v": "str", + "foo_v": "builtins.str", "foo_p": "pathlib.Path" }, "cls_var_occur": { @@ -173,21 +174,21 @@ "cls_var_ln": { "foo_v": [ [ - 29, + 30, 4 ], [ - 29, + 30, 9 ] ], "foo_p": [ [ - 12, + 13, 4 ], [ - 12, + 13, 9 ] ] @@ -198,11 +199,11 @@ "q_name": "Foo.__init__..foo_inner", "fn_lc": [ [ - 15, + 16, 8 ], [ - 16, + 17, 16 ] ], @@ -234,11 +235,11 @@ "q_name": "Foo.__init__", "fn_lc": [ [ - 13, + 14, 4 ], [ - 16, + 17, 16 ] ], @@ -259,11 +260,11 @@ "fn_var_ln": { "i": [ [ - 14, + 15, 8 ], [ - 14, + 15, 14 ] ] @@ -282,11 +283,11 @@ "q_name": "Foo.foo_fn..foo_inner", "fn_lc": [ [ - 18, + 19, 8 ], [ - 19, + 20, 16 ] ], @@ -330,11 +331,11 @@ "q_name": "Foo.foo_fn", "fn_lc": [ [ - 17, + 18, 4 ], [ - 21, + 22, 16 ] ], @@ -370,11 +371,11 @@ "fn_var_ln": { "d": [ [ - 20, + 21, 8 ], [ - 20, + 21, 9 ] ] @@ -394,11 +395,11 @@ "q_name": "Foo.get_e", "fn_lc": [ [ - 23, + 24, 4 ], [ - 24, + 25, 24 ] ], @@ -429,11 +430,11 @@ "q_name": "Foo.get_e", "fn_lc": [ [ - 26, + 27, 4 ], [ - 28, + 29, 24 ] ], @@ -474,11 +475,11 @@ "fn_var_ln": { "foo_v": [ [ - 27, + 28, 8 ], [ - 27, + 28, 17 ] ] @@ -502,11 +503,11 @@ "q_name": "Bar", "fn_lc": [ [ - 30, + 31, 0 ], [ - 33, + 34, 12 ] ], @@ -557,21 +558,21 @@ "fn_var_ln": { "v": [ [ - 31, + 32, 4 ], [ - 31, + 32, 5 ] ], "l": [ [ - 32, + 33, 4 ], [ - 32, + 33, 5 ] ] @@ -593,11 +594,11 @@ null ], "no_types_annot": { - "U": 15, - "D": 14, + "U": 14, + "D": 15, "I": 0 }, - "type_annot_cove": 0.48 + "type_annot_cove": 0.52 } } } diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 6b23b63..8607de1 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -5,6 +5,7 @@ import shutil test_file = """from pathlib import Path +import pandas x: int = 12 l = [(1, 2)] c = defaultdict(int) @@ -41,12 +42,12 @@ def Bar(x=['apple', 'orange'], *, c): test_file_exp = """from typing import Tuple, Dict, List, Literal from collections import defaultdict -import pandas import pathlib import builtins import collections import typing from pathlib import Path +import pandas x: builtins.int = 12 l: typing.List[typing.Tuple[builtins.int, builtins.int]] = [(1, 2)] c: collections.defaultdict = defaultdict(int) @@ -74,7 +75,7 @@ def get_e(self): def get_e(self, y: builtins.str): Foo.foo_v = y return Foo.foo_v - foo_v: str = "No" + foo_v: builtins.str = "No" def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[builtins.str]: v: typing.List[builtins.str] = x l = lambda e: e+1 From d37484863b9898a8ef03be8bca4d500001ae5c9f Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 2 Aug 2021 14:35:26 +0200 Subject: [PATCH 25/31] (1) Exclude source files in the ignored list for the main pipeline, (2) include type error categories by mypy in the JSON output --- libsa4py/__main__.py | 4 ++- 
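To see what the visit_ImportAlias added in PATCH 24 collects, here is a small self-contained run of the same matcher pattern. Note that for a from-import only the bound names appear as aliases, not the source module, which is why the applier still subtracts collected names from both the module set and the type-name set:

    import libcst as cst
    import libcst.matchers as match

    code = "import pandas\nimport os.path\nfrom pathlib import Path\n"
    imported = []
    for alias in match.findall(cst.parse_module(code), match.ImportAlias()):
        imported.extend(n.value for n in match.findall(alias.name, match.Name()))
    print(imported)  # ['pandas', 'os', 'path', 'Path']
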
libsa4py/cst_pipeline.py | 64 ++++++++++++++++++++++++---------------- libsa4py/type_check.py | 8 ++--- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 8dc283e..4eff86c 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -7,7 +7,8 @@ def process_projects(args): input_repos = find_repos_list(args.p) if args.l is None else find_repos_list(args.p)[:args.l] - p = Pipeline(args.p, args.o, not args.no_nlp, args.use_cache, args.use_pyre, args.use_tc, args.d, args.s) + p = Pipeline(args.p, args.o, not args.no_nlp, args.use_cache, args.use_pyre, args.use_tc, args.d, + args.s, args.i) p.run(input_repos, args.j) @@ -31,6 +32,7 @@ def main(): process_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") process_parser.add_argument("--d", "--deduplicate", required=False, type=str, help="Path to duplicate files") process_parser.add_argument("--s", "--split", required=False, type=str, help="Path to the dataset split files") + process_parser.add_argument("--i", "--ignore", required=False, type=str, help="Path to the ignored files") process_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing projects") process_parser.add_argument("--l", required=False, type=int, help="Number of projects to process") process_parser.add_argument("--c", "--cache", dest='use_cache', action='store_true', help="Whether to ignore processed projects") diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index ffa870d..4eaf82e 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -16,7 +16,7 @@ from multiprocessing.queues import Queue from dpu_utils.utils.dataloading import load_jsonl_gz from libsa4py.cst_extractor import Extractor -from libsa4py.cst_transformers import TypeApplier +from libsa4py.cst_transformers import TypeAnnotationRemover, TypeApplier from libsa4py.exceptions import ParseError, NullProjectException from libsa4py.nl_preprocessing import NLPreprocessor from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file, \ @@ -38,7 +38,7 @@ class Pipeline: def __init__(self, projects_path, output_dir, nlp_transf: bool = True, use_cache: bool = True, use_pyre: bool = False, use_tc: bool = False, - dups_files_path=None, split_files_path=None): + dups_files_path=None, split_files_path=None, ignored_files_path=None): self.projects_path = projects_path self.output_dir = output_dir self.processed_projects = None @@ -60,6 +60,11 @@ def __init__(self, projects_path, output_dir, nlp_transf: bool = True, else: self.is_file_duplicate = lambda x: False + if ignored_files_path is not None: + self.ignored_files = set(read_file(ignored_files_path).splitlines()) + else: + self.ignored_files = {} + if self.use_tc: self.tc = MypyManager('mypy', MAX_TC_TIME) @@ -162,6 +167,8 @@ def process_project(self, i, project): print(f"{project_id} has {len(project_files)} files before deduplication") project_files = [f for f in project_files if not self.is_file_duplicate(f)] print(f"{project_id} has {len(project_files)} files after deduplication") + project_files = [f for f in project_files if str(Path(f).relative_to(Path(self.projects_path).parent)) not in self.ignored_files] + print(f"{project_id} has {len(project_files)} files after ignoring files") project_files = [(f, str(Path(f).relative_to(Path(self.projects_path).parent))) for f in project_files] project_files = [(f, f_r, 
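The ignore check compares paths relative to the parent of the projects directory, which matches the author/repo-prefixed form the processed JSONs use for file keys. For a hypothetical layout:

    from pathlib import Path

    projects_path = Path("/data/repos")                    # hypothetical root
    f = Path("/data/repos/author/repo/pkg/mod.py")
    rel = str(f.relative_to(projects_path.parent))
    print(rel)  # 'repos/author/repo/pkg/mod.py'
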
self.split_dataset_files[f_r] if f_r in self.split_dataset_files else None) for f, @@ -323,6 +330,8 @@ class TypeAnnotationsRemoval: Removes type annotations that cannot be type-checked by mypy """ + MAX_TYPE_ERRORS_PER_FILE = 500 + def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, no_projects_limit: int = None, dry_run: bool = False, apply_nlp: bool = True): self.projects_path = projects_path @@ -349,12 +358,13 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): try: tmp_f = create_tmp_file(".py") f_read = read_file(join(self.projects_path, f)) - f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], + f_tc_code, tc_errs, type_annot_r, tc_errors = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], tmp_f) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, - "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D'], + "errors": tc_errors} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) if not self.dry_run and tc_errs == 0: write_file(join(self.projects_path, f), f_tc_code) @@ -416,7 +426,7 @@ def __load_projects_files(self, q: Queue, is_done): proj_json = load_json(p_j) for _, p_v in proj_json.items(): for f, f_v in p_v['src_files'].items(): - if not f_v['tc'][0] and f_v['tc'] != [False, None] and f_v['tc'][1] <= 100: + if not f_v['tc'][0] and f_v['tc'] != [False, None, None] and f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: q.put((f, f_v)) f_loaded += 1 #print("Adding files to Queue...") @@ -431,7 +441,7 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ MAX_TRY = 10 def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) nonlocal no_try if no_tc_err is not None: if tc: @@ -440,13 +450,15 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) + no_try += 1 else: org_gt_d = org_gt no_try += 1 - return tc, no_tc_err, f_code + return tc, no_tc_err, f_code, tc_errors out_f_code: str = "" + tc_errors = None for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") @@ -461,10 +473,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(m_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['variables'][m_v] = m_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, m_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, m_v_t, f_d_repr['variables'][m_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors for i, fn in enumerate(f_d_repr['funcs']): for p_n, p_t in fn['params'].items(): @@ -481,10 +493,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, 
org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['params'][p_n] = p_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['funcs'][i]['params'][p_n]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": @@ -500,10 +512,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, f_d_repr['funcs'][i]['variables'][fn_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The return type for module-level functions if f_d_repr['funcs'][i]['ret_type'] != "": @@ -520,10 +532,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['ret_type'] = org_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['funcs'][i]['ret_type']) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The type of class-level vars for c_i, c in enumerate(f_d_repr['classes']): @@ -541,10 +553,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(c_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, c_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, c_v_t, f_d_repr['classes'][c_i]['variables'][c_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The type of arguments for class-level functions for fn_i, fn in enumerate(c['funcs']): @@ -562,10 +574,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The type of local variables for class-level functions for fn_v, fn_v_t in fn['variables'].items(): @@ -582,10 +594,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, 
f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": @@ -603,16 +615,16 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors - return out_f_code, init_no_tc_err, type_annots_removed + return out_f_code, init_no_tc_err, type_annots_removed, tc_errors def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, apply_nlp=self.apply_nlp)) write_to_tmp_file(out_f, f_t_applied.code) - tc, no_tc_err = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) - return tc, no_tc_err, f_t_applied.code + tc, no_tc_err, tc_errors = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) + return tc, no_tc_err, f_t_applied.code, tc_errors diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index 40a9f19..770ac41 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -167,13 +167,13 @@ def _report_errors(self, parsed_result): print(f"Error breaking down: {parsed_result.err_breakdown}.") -def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None]]: +def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None], Union[dict, None]]: try: no_t_err = tc.heavy_assess(f_path) if no_t_err is not None: - return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs) + return (True, 0, no_t_err.err_breakdown) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs, no_t_err.err_breakdown) else: - return False, None + return False, None, None except IndexError: print(f"f: {f_path} - No output from Mypy!") - return False, None + return False, None, None From 729c76d4db822fba1ce4b6196abca7fef6aac2fa Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 11:54:47 +0200 Subject: [PATCH 26/31] Putting large projects at the front of the jobs' queue to reduce overall processing time --- libsa4py/cst_pipeline.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 4cbe4c3..5eda048 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -137,7 +137,7 @@ def fn_nlp_transf(fn_d: dict, nlp_prep: NLPreprocessor): return extracted_module - def process_project(self, i, project): + def process_project(self, i, project, project_files: List[str]): project_id = f'{project["author"]}/{project["repo"]}' project_analyzed_files: dict = {project_id: {"src_files": {}, "type_annot_cove": 0.0}} @@ -148,7 +148,6 @@ def process_project(self, i, project): print(f'Extracting for {project_id}...') extracted_avl_types = None - project_files = list_files(join(self.projects_path, project["author"], project["repo"])) print(f"{project_id} has 
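With the error breakdown included in PATCH 25, callers of type_check_single_file now unpack a three-tuple. A usage sketch; the 120-second budget again stands in for MAX_TC_TIME:

    from libsa4py.type_check import MypyManager, type_check_single_file

    tc, n_errs, err_breakdown = type_check_single_file(
        "pkg/mod.py", MypyManager('mypy', 120))
    if tc:
        print("file type-checks cleanly")
    elif n_errs is not None:
        print(f"{n_errs} type errors, by category: {err_breakdown}")
    else:
        print("mypy produced no usable output")
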
{len(project_files)} files before deduplication") project_files = [f for f in project_files if not self.is_file_duplicate(f)] print(f"{project_id} has {len(project_files)} files after deduplication") @@ -233,12 +232,14 @@ def process_project(self, i, project): def run(self, repos_list: List[Dict], jobs, start=0): print(f"Number of projects to be processed: {len(repos_list)}") - repos_list = [p for p in repos_list if not (os.path.exists(self.get_project_filename(p)) and self.use_cache)] + repos_list = [(p, list_files(join(self.projects_path, p["author"], p["repo"]))) \ + for p in repos_list if not (os.path.exists(self.get_project_filename(p)) and self.use_cache)] + repos_list.sort(key=lambda x: len(x[1]), reverse=True) print(f"Number of projects to be processed after considering cache: {len(repos_list)}") start_t = time.time() ParallelExecutor(n_jobs=jobs)(total=len(repos_list))( - delayed(self.process_project)(i, project) for i, project in enumerate(repos_list, start=start)) + delayed(self.process_project)(i, p, p_files) for i, (p, p_files) in enumerate(repos_list, start=start)) print("Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time()-start_t)))) if self.use_pyre: From 35539442f4195d251c1a3cf95efec9fa255b2c0a Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 11:57:23 +0200 Subject: [PATCH 27/31] ignore type errors of imported modules and missing imports when type-checking by mypy --- libsa4py/type_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index 770ac41..4aeaebe 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -126,7 +126,8 @@ def heavy_assess(self, fpath): class MypyManager(TCManager): def _build_tc_cmd(self, fpath): # Mypy needs a flag to display the error codes - return ["mypy", "--show-error-codes", "--no-incremental", "--cache-dir=/dev/null", fpath] + return ["mypy", "--show-error-codes", "--no-incremental", "--cache-dir=/dev/null", + "--follow-imports=silent", "--ignore-missing-imports", fpath] def _check_tc_outcome(self, _, outlines): if any(l.endswith(err) for l in outlines for err in self._inc_errcodes): From 61b4b0cb78802d7ba7ea7dcc632221b5382e20ea Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 12:01:03 +0200 Subject: [PATCH 28/31] Improvements to TypeApplier: (1) write ignored files to a separate file (2) fix try attempts for failed type-checking (3) type-check the original file rather than a temp file --- libsa4py/cst_pipeline.py | 71 +++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 4eaf82e..9cc52ea 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -342,7 +342,7 @@ def __init__(self, projects_path: str, processed_projects_path: str, output_path self.apply_nlp = apply_nlp #def process_file(self, f: str, f_d_repr: dict, tc_res: dict): - def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): + def process_file(self, q: Queue, is_f_loader_done, tc_res: dict, ignored_files: list): # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. 
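Sorting the work items in decreasing order of size is the classic longest-processing-time-first heuristic: handing the biggest projects out first avoids a long tail where one worker grinds through a huge repository after all the others have finished. In miniature:

    repos = [("tiny", ["a.py"]), ("huge", ["a.py", "b.py", "c.py"])]
    repos.sort(key=lambda r: len(r[1]), reverse=True)  # biggest first
    print([name for name, _ in repos])  # ['huge', 'tiny']
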
# init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), # MypyManager('mypy', MAX_TC_TIME)) @@ -356,23 +356,33 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): f, f_d_repr = q.get(True, 1) if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: try: - tmp_f = create_tmp_file(".py") + #tmp_f = create_tmp_file(".py") f_read = read_file(join(self.projects_path, f)) - f_tc_code, tc_errs, type_annot_r, tc_errors = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], - tmp_f) + _, tc_errs, type_annot_r, tc_errors = self.remove_unchecked_type_annot(join(self.projects_path, f), + f_read, f_d_repr, f_d_repr['tc'][1]) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D'], "errors": tc_errors} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - if not self.dry_run and tc_errs == 0: - write_file(join(self.projects_path, f), f_tc_code) + if tc_errs == 0: + if self.dry_run: + write_file(join(self.projects_path, f), f_read) + else: + write_file(join(self.projects_path, f), f_read) + ignored_files.append(f) except Exception as e: - print(f"f: {f} | e: {e}") + print(f"F: {f} | e: {e}") traceback.print_exc() - finally: - delete_tmp_file(tmp_f) + # finally: + # delete_tmp_file(tmp_f) + else: + print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") + tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": f_d_repr['tc'][1], "ta_rem": None, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D'], + "errors": None} + ignored_files.append(f) except queue.Empty as e: print(f"Worker {os.getpid()} finished! 
Queue's empty!") print(f"File loader working {is_f_loader_done.value} and queue size {q.qsize()}") @@ -381,8 +391,9 @@ def run(self, jobs: int): manager = Manager() q = manager.Queue() is_f_loader_done = manager.Value('i', False) + ignored_files_a = manager.list() - file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done)) + file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done, ignored_files_a)) file_loader.start() #file_loader.join() @@ -402,14 +413,14 @@ def run(self, jobs: int): time.sleep(5) start_t = time.time() tc_res = manager.dict() + ignored_files_b = manager.list() file_processors = [] for j in range(jobs): - p = Process(target=self.process_file, args=(q, is_f_loader_done, tc_res)) + p = Process(target=self.process_file, args=(q, is_f_loader_done, tc_res, ignored_files_b)) p.daemon = True file_processors.append(p) p.start() - for p in file_processors: p.join() file_loader.join() @@ -417,8 +428,9 @@ def run(self, jobs: int): # for f, f_d in not_tced_src_f) print(f"Finished fixing invalid types in {str(timedelta(seconds=time.time() - start_t))}") save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) + write_file(join(self.processed_projects_path, 'ignored_files.txt'), '\n'.join(list(ignored_files_a) + list(ignored_files_b))) - def __load_projects_files(self, q: Queue, is_done): + def __load_projects_files(self, q: Queue, is_done, ignored_files: list): proj_jsons = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') proj_jsons = proj_jsons[:self.no_projects_limit] if self.no_projects_limit is not None else proj_jsons f_loaded = 0 @@ -426,22 +438,28 @@ def __load_projects_files(self, q: Queue, is_done): proj_json = load_json(p_j) for _, p_v in proj_json.items(): for f, f_v in p_v['src_files'].items(): - if not f_v['tc'][0] and f_v['tc'] != [False, None, None] and f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: - q.put((f, f_v)) - f_loaded += 1 + if not f_v['tc'][0]: + if f_v['tc'] != [False, None, None]: + if f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: + q.put((f, f_v)) + f_loaded += 1 + else: + ignored_files.append(f) + else: + ignored_files.append(f) #print("Adding files to Queue...") is_done.value = True print(f"Loaded {f_loaded} Python files") - def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, - f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: + def remove_unchecked_type_annot(self, f_path: str, f_read: str, f_d_repr: dict, + init_no_tc_err: int) -> Tuple[str, int, List[str]]: type_annots_removed: List[str] = [] no_try = 0 MAX_TRY = 10 def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): - tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_path, f_read, f_d_repr) nonlocal no_try if no_tc_err is not None: if tc: @@ -450,10 +468,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) - no_try += 1 else: org_gt_d = org_gt no_try += 1 + else: + no_try += 1 return tc, no_tc_err, f_code, tc_errors @@ -622,9 +641,15 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): return out_f_code, init_no_tc_err, type_annots_removed, tc_errors - def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: 
NamedTemporaryFile): + def __type_check_type_annotation(self, f_path: str, f_read: str, f_d_repr: dict): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, apply_nlp=self.apply_nlp)) - write_to_tmp_file(out_f, f_t_applied.code) - tc, no_tc_err, tc_errors = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) + + # Writing applied code to temp files has an advantage which isolates the file and as a result, + # type-checking may be successful for some failed cases with the original file + # tmp_f = create_tmp_file(".py") + # write_to_tmp_file(tmp_f, f_t_applied.code) + write_file(f_path, f_t_applied.code) + tc, no_tc_err, tc_errors = type_check_single_file(f_path, MypyManager('mypy', MAX_TC_TIME)) + #delete_tmp_file(tmp_f) return tc, no_tc_err, f_t_applied.code, tc_errors From e3ddcc25e4e86827e0588640c22aa0b0654b6b80 Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 13:57:12 +0200 Subject: [PATCH 29/31] In the main pipeline, sort projects based on total size of their files --- libsa4py/cst_pipeline.py | 9 +++++---- libsa4py/merge.py | 2 +- libsa4py/utils.py | 8 +++++--- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 5eda048..b645899 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -232,14 +232,15 @@ def process_project(self, i, project, project_files: List[str]): def run(self, repos_list: List[Dict], jobs, start=0): print(f"Number of projects to be processed: {len(repos_list)}") - repos_list = [(p, list_files(join(self.projects_path, p["author"], p["repo"]))) \ + repos_list = [(p, *list_files(join(self.projects_path, p["author"], p["repo"]))) \ for p in repos_list if not (os.path.exists(self.get_project_filename(p)) and self.use_cache)] - repos_list.sort(key=lambda x: len(x[1]), reverse=True) + # Sorts projects based on total size of their files + repos_list.sort(key=lambda x: x[2], reverse=True) print(f"Number of projects to be processed after considering cache: {len(repos_list)}") start_t = time.time() ParallelExecutor(n_jobs=jobs)(total=len(repos_list))( - delayed(self.process_project)(i, p, p_files) for i, (p, p_files) in enumerate(repos_list, start=start)) + delayed(self.process_project)(i, p, p_files) for i, (p, p_files, p_size) in enumerate(repos_list, start=start)) print("Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time()-start_t)))) if self.use_pyre: @@ -278,6 +279,6 @@ def process_project(self, proj_json_path: str): print(f"Can't parsed file {f} in project {proj_json_path}", pse) def run(self, jobs: int): - proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') + proj_jsons, _ = list_files(join(self.output_path, 'processed_projects'), '.json') proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True) ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) for p_j in proj_jsons) diff --git a/libsa4py/merge.py b/libsa4py/merge.py index c5932de..8f0e64c 100644 --- a/libsa4py/merge.py +++ b/libsa4py/merge.py @@ -137,6 +137,6 @@ def merge_projects(args): """ Saves merged projects into a single JSON file and a Dataframe """ - merged_jsons = merge_jsons_to_dict(list_files(join(args.o, 'processed_projects'), ".json"), args.l) + merged_jsons = merge_jsons_to_dict(list_files(join(args.o, 'processed_projects'), ".json")[0], args.l) save_json(join(args.o, 'merged_%s_projects.json' % (str(args.l) if args.l is not None else 
'all')), merged_jsons) create_dataframe_fns(args.o, merged_jsons) diff --git a/libsa4py/utils.py b/libsa4py/utils.py index c247c40..c87931e 100644 --- a/libsa4py/utils.py +++ b/libsa4py/utils.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple from tqdm import tqdm from joblib import Parallel from os.path import join, isdir @@ -54,18 +54,20 @@ def tmp(op_iter): # return directory -def list_files(directory: str, file_ext: str = ".py") -> list: +def list_files(directory: str, file_ext: str = ".py") -> Tuple[list, int]: """ List all files in the given directory (recursively) """ filenames = [] + dir_size = 0 for root, dirs, files in os.walk(directory): for filename in files: if filename.endswith(file_ext): filenames.append(os.path.join(root, filename)) + dir_size += Path(os.path.join(root, filename)).stat().st_size - return filenames + return filenames, dir_size def read_file(filename: str) -> str: From 715aea8adaccafc11e705fbbb073cbd0f5204b26 Mon Sep 17 00:00:00 2001 From: mir-am Date: Tue, 10 Aug 2021 16:07:35 +0200 Subject: [PATCH 30/31] Improvements to TypeRemover: (1) Copying input dataset to another dest. for analysis (2) preserve removed type annot. when type errors aren't resolved --- libsa4py/__main__.py | 7 +-- libsa4py/cst_pipeline.py | 94 ++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 4eff86c..4bcd1e7 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -18,7 +18,7 @@ def apply_types_projects(args): def remove_err_type_annotations(args): - tar = TypeAnnotationsRemoval(args.p, args.o, "", args.l, args.dry_run) + tar = TypeAnnotationsRemoval(args.i, args.o, args.p, args.l, args.dry_run) tar.run(args.j) @@ -65,8 +65,9 @@ def main(): apply_parser.set_defaults(func=apply_types_projects) remove_parser = sub_parsers.add_parser('remove') - remove_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") - remove_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") + remove_parser.add_argument("--i", required=True, type=str, help="Path to input dataset") + remove_parser.add_argument("--o", required=True, type=str, help="Path to output dataset") + remove_parser.add_argument("--p", required=True, type=str, help="Path to JSON-formatted processed projects") remove_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing files") remove_parser.add_argument("--l", required=False, type=int, help="Number of projects to process") remove_parser.add_argument("--d", dest='dry_run', action='store_true', diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index a53a554..fd8d979 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -20,7 +20,7 @@ from libsa4py.exceptions import ParseError, NullProjectException from libsa4py.nl_preprocessing import NLPreprocessor from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file, \ - create_tmp_file, write_to_tmp_file, delete_tmp_file + create_tmp_file, write_to_tmp_file, delete_tmp_file, mk_dir_cp_file from libsa4py.pyre import pyre_server_init, pyre_query_types, pyre_server_shutdown, pyre_kill_all_servers, \ clean_pyre_config from libsa4py.type_check import MypyManager, type_check_single_file @@ -333,11 +333,11 @@ class TypeAnnotationsRemoval: MAX_TYPE_ERRORS_PER_FILE = 500 - def __init__(self, projects_path: str, 
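Since list_files now also reports the total size of the matched files, every call site has to unpack a pair, as the changes to merge.py and the pipelines above show. Typical usage after this patch:

    from libsa4py.utils import list_files

    files, dir_size = list_files("repos/author/repo")
    print(f"{len(files)} Python files, {dir_size / 1024:.1f} KiB on disk")
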
processed_projects_path: str, output_path: str, no_projects_limit: int = None, + def __init__(self, input_projects_path: str, output_projects_path: str, processed_projects_path: str, no_projects_limit: int = None, dry_run: bool = False, apply_nlp: bool = True): - self.projects_path = projects_path + self.input_projects_path = input_projects_path self.processed_projects_path = processed_projects_path - self.output_path = output_path + self.output_projects_path = output_projects_path self.no_projects_limit = no_projects_limit self.dry_run = dry_run self.apply_nlp = apply_nlp @@ -358,8 +358,8 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict, ignored_files: if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: try: #tmp_f = create_tmp_file(".py") - f_read = read_file(join(self.projects_path, f)) - _, tc_errs, type_annot_r, tc_errors = self.remove_unchecked_type_annot(join(self.projects_path, f), + f_read = read_file(join(self.output_projects_path, f)) + _, tc_errs, type_annot_r, tc_errors = self.remove_unchecked_type_annot(join(self.output_projects_path, f), f_read, f_d_repr, f_d_repr['tc'][1]) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") @@ -369,9 +369,9 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict, ignored_files: # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) if tc_errs == 0: if self.dry_run: - write_file(join(self.projects_path, f), f_read) + write_file(join(self.output_projects_path, f), f_read) else: - write_file(join(self.projects_path, f), f_read) + write_file(join(self.output_projects_path, f), f_read) ignored_files.append(f) except Exception as e: print(f"F: {f} | e: {e}") @@ -393,8 +393,10 @@ def run(self, jobs: int): q = manager.Queue() is_f_loader_done = manager.Value('i', False) ignored_files_a = manager.list() + type_checked_files = manager.list() - file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done, ignored_files_a)) + file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done, ignored_files_a, + type_checked_files)) file_loader.start() #file_loader.join() @@ -430,9 +432,10 @@ def run(self, jobs: int): print(f"Finished fixing invalid types in {str(timedelta(seconds=time.time() - start_t))}") save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) write_file(join(self.processed_projects_path, 'ignored_files.txt'), '\n'.join(list(ignored_files_a) + list(ignored_files_b))) + write_file(join(self.processed_projects_path, 'tced_files.txt'), '\n'.join(list(type_checked_files))) - def __load_projects_files(self, q: Queue, is_done, ignored_files: list): - proj_jsons = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') + def __load_projects_files(self, q: Queue, is_done, ignored_files: list, type_checked_files: list): + proj_jsons, _ = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') proj_jsons = proj_jsons[:self.no_projects_limit] if self.no_projects_limit is not None else proj_jsons f_loaded = 0 for p_j in proj_jsons: @@ -442,15 +445,24 @@ def __load_projects_files(self, q: Queue, is_done, ignored_files: list): if not f_v['tc'][0]: if f_v['tc'] != [False, None, None]: if f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: + mk_dir_cp_file(join('/home/amir/data/MT4Py-pyre-apply', 
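After this restructuring the removal pipeline is driven by three separate paths: the pristine input dataset, the output dataset it writes into, and the directory of processed JSONs. A sketch of what remove_err_type_annotations now wires up from the CLI flags; the three paths are hypothetical:

    from libsa4py.cst_pipeline import TypeAnnotationsRemoval

    tar = TypeAnnotationsRemoval("/data/input_ds", "/data/output_ds",
                                 "/data/processed", no_projects_limit=None,
                                 dry_run=True)
    tar.run(jobs=8)
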
f), join(self.output_projects_path, f)) q.put((f, f_v)) f_loaded += 1 + print(f"Added file {f} to the analysis queue") else: ignored_files.append(f) else: ignored_files.append(f) + else: + type_checked_files.append(f) + #print("Adding files to Queue...") is_done.value = True print(f"Loaded {f_loaded} Python files") + + for f in type_checked_files: + mk_dir_cp_file(join(self.input_projects_path, f), join(self.output_projects_path, f)) + print(f"Copied type-checked file: {f}") def remove_unchecked_type_annot(self, f_path: str, f_read: str, f_d_repr: dict, init_no_tc_err: int) -> Tuple[str, int, List[str]]: @@ -459,18 +471,16 @@ def remove_unchecked_type_annot(self, f_path: str, f_read: str, f_d_repr: dict, no_try = 0 MAX_TRY = 10 - def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): + def type_check_ta(curr_no_tc_err: int, org_gt): tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_path, f_read, f_d_repr) nonlocal no_try if no_tc_err is not None: if tc: type_annots_removed.append(org_gt) elif no_tc_err < curr_no_tc_err: - curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) else: - org_gt_d = org_gt no_try += 1 else: no_try += 1 @@ -493,10 +503,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(m_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['variables'][m_v] = m_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, m_v_t, - f_d_repr['variables'][m_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, m_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['variables'][m_v] = m_v_t for i, fn in enumerate(f_d_repr['funcs']): for p_n, p_t in fn['params'].items(): @@ -513,10 +524,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['params'][p_n] = p_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['funcs'][i]['params'][p_n]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, p_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['funcs'][i]['params'][p_n] = p_t for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": @@ -532,10 +544,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['funcs'][i]['variables'][fn_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, fn_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t # The return type for module-level functions if f_d_repr['funcs'][i]['ret_type'] != "": @@ -552,10 +565,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['ret_type'] = org_t - tc, no_tc_err, f_code, tc_errors = 
type_check_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['funcs'][i]['ret_type']) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, org_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['funcs'][i]['ret_type'] = org_t # The type of class-level vars for c_i, c in enumerate(f_d_repr['classes']): @@ -573,10 +587,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(c_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, c_v_t, - f_d_repr['classes'][c_i]['variables'][c_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, c_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t # The type of arguments for class-level functions for fn_i, fn in enumerate(c['funcs']): @@ -594,10 +609,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, p_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t # The type of local variables for class-level functions for fn_v, fn_v_t in fn['variables'].items(): @@ -614,10 +630,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, fn_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": @@ -635,10 +652,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, org_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t return out_f_code, init_no_tc_err, type_annots_removed, tc_errors From 060c4bef721d05a4a970279b0319f250d84e86b7 Mon Sep 17 00:00:00 2001 
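Editor's aside before the follow-up patch: every hunk above instantiates the same greedy pattern — blank one annotation, re-run mypy, keep the removal if the file now type-checks or the error count drops, and restore the annotation otherwise. Below is a minimal, self-contained sketch of that loop, assuming mypy is on PATH; `annotations`, `render_source`, and `count_mypy_errors` are hypothetical stand-ins for the nested `f_d_repr` structure, the `TypeApplier`-based re-rendering, and `MypyManager`, respectively — not the patch's actual API.

import os
import subprocess
import tempfile
from typing import Callable, Dict, List, Tuple


def count_mypy_errors(code: str) -> int:
    # Hypothetical stand-in for MypyManager / type_check_single_file:
    # write the code to a temp file, run mypy on it, and count its errors.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp:
        tmp.write(code)
        path = tmp.name
    try:
        result = subprocess.run(["mypy", path], capture_output=True, text=True)
        return sum(1 for ln in result.stdout.splitlines() if ": error:" in ln)
    finally:
        os.unlink(path)


def greedily_remove_annotations(annotations: Dict[str, str],
                                render_source: Callable[[Dict[str, str]], str]
                                ) -> Tuple[str, List[str]]:
    removed: List[str] = []
    best = count_mypy_errors(render_source(annotations))
    for name, annot in list(annotations.items()):
        if annot == "":
            continue
        annotations[name] = ""  # tentatively blank this annotation
        errors = count_mypy_errors(render_source(annotations))
        if errors == 0:
            # The file now type-checks: stop early, as the patch does.
            removed.append(annot)
            break
        elif errors < best:
            # Fewer errors than before: keep the removal.
            best = errors
            removed.append(annot)
        else:
            # No improvement: restore the original annotation.
            annotations[name] = annot
    return render_source(annotations), removed

The real method additionally caps unsuccessful attempts with MAX_TRY and walks module-level variables, function parameters, local variables, and return types, then the same fields per class.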
From 060c4bef721d05a4a970279b0319f250d84e86b7 Mon Sep 17 00:00:00 2001
From: mir-am
Date: Tue, 10 Aug 2021 16:08:48 +0200
Subject: [PATCH 31/31] Add a utility method to copy files while making required dirs

---
 libsa4py/utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libsa4py/utils.py b/libsa4py/utils.py
index fdae75f..569f9f5 100644
--- a/libsa4py/utils.py
+++ b/libsa4py/utils.py
@@ -1,3 +1,4 @@
+import shutil
 from typing import List, Tuple
 from tqdm import tqdm
 from joblib import Parallel
@@ -82,6 +83,13 @@ def write_file(filename: str, content: str):
     with open(filename, 'w') as file:
         file.write(content)

+def mk_dir_cp_file(src_path: str, dest_path: str):
+    """
+    Creates directories in the destination if they do not exist and copies the given file
+    """
+    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
+    shutil.copy(src_path, dest_path)
+
 def save_json(filename: str, dict_obj: dict):
     """
     Dumps a dict object into a JSON file
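A usage sketch for the new helper (the paths are illustrative): unlike a bare shutil.copy, it first materializes the destination directory tree, and exist_ok=True makes repeated calls safe.

from libsa4py.utils import mk_dir_cp_file

# Copies the file into a destination tree that may not exist yet;
# out/proj/pkg/ is created on the fly before the copy.
mk_dir_cp_file("src/proj/pkg/example.py", "out/proj/pkg/example.py")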