From 39660d5d5fe1c94c8eba97c57443935703e844b8 Mon Sep 17 00:00:00 2001
From: mir-am
Date: Wed, 30 Jun 2021 18:16:17 +0200
Subject: [PATCH 01/31] Removing type annotations that do not type check by mypy [WIP] - Part 1

---
 libsa4py/cst_pipeline.py | 166 +++++++++++++++++-
 libsa4py/cst_transformers.py | 41 ++++-
 libsa4py/type_check.py | 7 +-
 libsa4py/utils.py | 22 +++
 tests/examples/type_apply_ex.json | 6 +-
 tests/examples/type_apply_typed_ex.json | 223 ++++++++++++++++++++++++
 tests/test_type_apply.py | 52 +++++-
 7 files changed, 500 insertions(+), 17 deletions(-)
 create mode 100644 tests/examples/type_apply_typed_ex.json

diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py
index 4cbe4c3..b62f00f 100644
--- a/libsa4py/cst_pipeline.py
+++ b/libsa4py/cst_pipeline.py
@@ -4,8 +4,9 @@
 import csv
 import time
 
-from typing import List, Dict
+from typing import List, Dict, Tuple
 from os.path import join
+from tempfile import NamedTemporaryFile
 from pathlib import Path
 from datetime import timedelta
 from joblib import delayed
@@ -14,7 +15,8 @@
 from libsa4py.cst_transformers import TypeApplier
 from libsa4py.exceptions import ParseError, NullProjectException
 from libsa4py.nl_preprocessing import NLPreprocessor
-from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file
+from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file, \
+    create_tmp_file, write_to_tmp_file, delete_tmp_file
 from libsa4py.pyre import pyre_server_init, pyre_query_types, pyre_server_shutdown, pyre_kill_all_servers, \
     clean_pyre_config
 from libsa4py.type_check import MypyManager, type_check_single_file
@@ -280,3 +282,163 @@ def run(self, jobs: int):
         proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json')
         proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True)
         ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) for p_j in proj_jsons)
+
+
+class TypeAnnotationsRemoval:
+    """
+    Removes type annotations that cannot be type-checked by mypy
+    """
+
+    def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, apply_nlp: bool = True):
+        self.projects_path = projects_path
+        self.processed_projects_path = processed_projects_path
+        self.output_path = output_path
+        self.apply_nlp = apply_nlp
+
+    def process_file(self, f:str, f_d_repr: dict):
+        f_read = read_file(join(self.projects_path, f))
+        # TODO: The inital type-checking should not be done after adding no. type errors to the representation later on.
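+        # `type_check_single_file` returns a pair: a boolean that is True only when
+        # mypy reports zero type errors, and the number of type errors found
+        # (None when mypy produced no usable output).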
+ init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), + MypyManager('mypy', MAX_TC_TIME)) + + if init_tc == False and init_no_tc_err is None: + return + else: + self.__remove_unchecked_type_annot(f_read, f_d_repr, ) + + + def run(self, jobs: int): + self.merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + not_tced_src_f: List[Tuple[str, dict]] = [] + for p, p_v in list(self.merged_projects['projects'].items()): + for f, f_v in p_v['src_files'].items(): + if not f_v['tc']: + not_tced_src_f.append((f, f_v)) + + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int): + tmp_f = create_tmp_file(".py") + out_f_code: str = "" + for m_v, m_v_t in f_d_repr['variables'].items(): + if m_v_t != "": + print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") + f_d_repr['variables'][m_v] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['variables'][m_v] = m_v_t + + for i, fn in enumerate(f_d_repr['funcs']): + for p_n, p_t in fn['params'].items(): + if p_t != "": + print(f"Type-checking function parameter {p_n} with annotation {p_t}") + f_d_repr['funcs'][i]['params'][p_n] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['funcs'][i]['params'][p_n] = p_t + + for fn_v, fn_v_t in fn['variables'].items(): + if fn_v_t != "": + print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") + f_d_repr['funcs'][i]['variables'][fn_v] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t + + # The return type for module-level functions + if f_d_repr['funcs'][i]['ret_type'] != "": + org_t = f_d_repr['funcs'][i]['ret_type'] + print(f"Type-checking function {f_d_repr['funcs'][i]['name']} return with {org_t}") + f_d_repr['funcs'][i]['ret_type'] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['funcs'][i]['ret_type'] = org_t + + # The type of class-level vars + for c_i, c in enumerate(f_d_repr['classes']): + for c_v, c_v_t in c['variables'].items(): + if c_v_t != "": + print(f"Type checking class variable {c_v} with annotation {c_v_t}") + f_d_repr['classes'][c_i]['variables'][c_v] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t + + # The type of arguments for class-level functions + for fn_i, fn in enumerate(c['funcs']): + for p_n, p_t in fn["params"].items(): + if p_t != "": + print(f"Type-checking function parameter {p_n} with annotation {p_t}") + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif 
no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t + + # The type of local variables for class-level functions + for fn_v, fn_v_t in fn['variables'].items(): + if fn_v_t != "": + print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = p + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t + + # The return type for class-level functions + if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": + org_t = f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] + print(f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + if tc: + return f_code + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + elif no_tc_err == init_no_tc_err: + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t + + # apply_inferred_types(src_f_read, src_f_ext, src_f_o_path) + delete_tmp_file(tmp_f) + return out_f_code + + def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): + f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, + apply_nlp=self.apply_nlp)) + write_to_tmp_file(out_f, f_t_applied.code) + tc, no_tc_err = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) + return tc, no_tc_err, f_t_applied.code + + + + + + + + + + diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index f115b91..23f18b8 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -962,9 +962,17 @@ def __get_var_type_an_assign(self, var_name: str): def __get_var_names_counter(self, node, scope): vars_name = match.extractall(node, match.OneOf(match.AssignTarget(target=match.SaveMatchedNode( match.Name(value=match.DoNotCare()), "name")), match.AnnAssign(target=match.SaveMatchedNode( - match.Name(value=match.DoNotCare()), "name")))) + match.Name(value=match.DoNotCare()), "name")) + )) + attr_name = match.extractall(node, match.OneOf(match.AssignTarget( + target=match.SaveMatchedNode(match.Attribute(value=match.Name(value=match.DoNotCare()), attr= + match.Name(value=match.DoNotCare())), "attr")), + match.AnnAssign(target=match.SaveMatchedNode(match.Attribute(value=match.Name(value=match.DoNotCare()), attr= + match.Name(value=match.DoNotCare())), "attr")))) return Counter([n['name'].value for n in vars_name if isinstance(self.get_metadata(cst.metadata.ScopeProvider, - n['name']), scope)]) + n['name']), scope)] + + [n['attr'].attr.value for n in attr_name if isinstance(self.get_metadata(cst.metadata.ScopeProvider, + n['attr']), scope)]) def visit_ClassDef(self, node: cst.ClassDef): self.cls_visited.append((self.__get_cls(node), @@ -987,6 +995,8 @@ def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.Fu if fn_ret_type is not None: self.all_applied_types.add((fn_ret_type_resolved, fn_ret_type)) return updated_node.with_changes(returns=fn_ret_type) + else: + return updated_node.with_changes(returns=None) return updated_node @@ -999,9 +1009,16 @@ def leave_Lambda(self, original_node: cst.Lambda, updated_node: cst.Lambda): def 
leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, updated_node: cst.SimpleStatementLine): + + # Untyped variables if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( + target=match.DoNotCare())])])): + if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( target=match.Name(value=match.DoNotCare()))])])): - t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value) + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value) + elif match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( + target=match.Attribute(value=match.Name(value=match.DoNotCare()), attr=match.Name(value=match.DoNotCare())))])])): + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.attr.value) if t is not None: t_annot_node_resolved = self.resolve_type_alias(t) @@ -1015,9 +1032,14 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, equal=cst.AssignEqual(whitespace_after=original_node.body[0].targets[0].whitespace_after_equal, whitespace_before=original_node.body[0].targets[0].whitespace_before_equal))] ) - elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Name(value=match.DoNotCare()))])): - t = self.__get_var_type_an_assign(original_node.body[0].target.value) - if t is not None: + # Typed variables + elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare())])): + if match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Name(value=match.DoNotCare()))])): + t = self.__get_var_type_an_assign(original_node.body[0].target.value) + elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Attribute(value=match.Name(value=match.DoNotCare()), + attr=match.Name(value=match.DoNotCare())))])): + t = self.__get_var_type_an_assign(original_node.body[0].target.attr.value) + if t: t_annot_node_resolved = self.resolve_type_alias(t) t_annot_node = self.__name2annotation(t_annot_node_resolved) if t_annot_node is not None: @@ -1027,6 +1049,11 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, value=original_node.body[0].value, annotation=t_annot_node, equal=original_node.body[0].equal)]) + else: + return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, + whitespace_before_equal=original_node.body[0].equal.whitespace_before, + whitespace_after_equal=original_node.body[0].equal.whitespace_after)], + value=original_node.body[0].value)]) return original_node @@ -1035,6 +1062,8 @@ def leave_Param(self, original_node: cst.Param, updated_node: cst.Param): fn_param_type = self.__get_fn_param_type(original_node.name.value) if fn_param_type is not None: return updated_node.with_changes(annotation=fn_param_type) + else: + return updated_node.with_changes(annotation=None) return original_node diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index fd5cbb2..ab97d2d 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -7,6 +7,7 @@ from abc import ABC, abstractmethod from os.path import dirname, basename +from typing import Tuple, Union from collections import Counter, namedtuple import toml import os @@ -164,9 +165,9 @@ def _report_errors(self, parsed_result): print(f"Error breaking down: 
{parsed_result.err_breakdown}.") -def type_check_single_file(f_path: str, tc: TCManager) -> bool: +def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None]]: no_t_err = tc.heavy_assess(f_path) if no_t_err is not None: - return True if no_t_err.no_type_errs == 0 else False + return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs) else: - return False + return False, None diff --git a/libsa4py/utils.py b/libsa4py/utils.py index c247c40..5519b27 100644 --- a/libsa4py/utils.py +++ b/libsa4py/utils.py @@ -2,6 +2,7 @@ from tqdm import tqdm from joblib import Parallel from os.path import join, isdir +from tempfile import NamedTemporaryFile from pathlib import Path import time import os @@ -113,3 +114,24 @@ def find_repos_list(projects_path: str) -> List[dict]: def mk_dir_not_exist(path: str): if not isdir(path): os.mkdir(path) + + +def create_tmp_file(suffix: str): + """ + It creates a temporary file. + NOTE: the temp file should be deleted manually after creation. + """ + return NamedTemporaryFile(mode="w", delete=False, suffix=suffix) + + +def delete_tmp_file(tmp_f: NamedTemporaryFile): + try: + os.unlink(tmp_f.name) + except TypeError: + print("Couldn't delete ", tmp_f.name) + + +def write_to_tmp_file(tmp_f: NamedTemporaryFile, text: str): + tmp_f.write(text) + #tmp_f.close() + return tmp_f diff --git a/tests/examples/type_apply_ex.json b/tests/examples/type_apply_ex.json index 39e4227..1727f94 100644 --- a/tests/examples/type_apply_ex.json +++ b/tests/examples/type_apply_ex.json @@ -64,7 +64,7 @@ "name": "Foo", "q_name": "Foo", "variables": { - "foo_v": "", + "foo_v": "str", "foo_p": "pathlib.Path" }, "cls_var_occur": { @@ -134,11 +134,11 @@ 16 ] ], - "params": {}, + "params": {"self": ""}, "ret_exprs": [], "params_occur": {}, "ret_type": "", - "variables": {}, + "variables": {"i": "int"}, "fn_var_occur": {}, "params_descr": {}, "docstring": { diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json new file mode 100644 index 0000000..85bd099 --- /dev/null +++ b/tests/examples/type_apply_typed_ex.json @@ -0,0 +1,223 @@ +{ + "tests/examples": { + "src_files": { + "type_apply_typed.py": { + "untyped_seq": "a = [number] [EOL] l = [ [number] , [number] , [number] ] [EOL] c = [number] [EOL] [EOL] def foo ( x , y ) : [EOL] z = x + y [EOL] return z [EOL] [EOL] class Bar : [EOL] bar_var1 = [string] [EOL] bar_var2 = [number] [EOL] def __init__ ( a , b ) : [EOL] self . a = a [EOL] self . 
b = b [EOL] def delta ( n ) : [EOL] return [ [number] ] * p [EOL]", + "typed_seq": "$builtins.int$ 0 0 0 $List[int]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 $builtins.float$ 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 $builtins.int$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 $List[float]$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0", + "imports": [], + "variables": { + "a": "", + "l": "", + "c": "" + }, + "mod_var_occur": { + "a": [ + [ + "self", + "a", + "builtins", + "int", + "a" + ] + ], + "l": [], + "c": [] + }, + "classes": [ + { + "name": "Bar", + "q_name": "Bar", + "variables": { + "bar_var1": "", + "bar_var2": "" + }, + "cls_var_occur": { + "bar_var1": [], + "bar_var2": [] + }, + "funcs": [ + { + "name": "__init__", + "q_name": "Bar.__init__", + "fn_lc": [ + [ + 12, + 4 + ], + [ + 14, + 18 + ] + ], + "params": { + "a": "", + "b": "" + }, + "ret_exprs": [], + "params_occur": { + "a": [ + [ + "self", + "a", + "builtins", + "int", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "ret_type": "", + "variables": { + "a": "", + "b": "" + }, + "fn_var_occur": { + "a": [ + [ + "self", + "a", + "builtins", + "int", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "params_descr": { + "a": "", + "b": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "delta", + "q_name": "Bar.delta", + "fn_lc": [ + [ + 15, + 4 + ], + [ + 16, + 25 + ] + ], + "params": { + "n": "" + }, + "ret_exprs": [ + "return [2.17] * p" + ], + "params_occur": { + "n": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "params_descr": { + "n": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } + ] + } + ], + "funcs": [ + { + "name": "foo", + "q_name": "foo", + "fn_lc": [ + [ + 5, + 0 + ], + [ + 7, + 12 + ] + ], + "params": { + "x": "", + "y": "" + }, + "ret_exprs": [ + "return z" + ], + "params_occur": { + "x": [ + [ + "z", + "builtins", + "int", + "x", + "y" + ] + ], + "y": [ + [ + "z", + "builtins", + "int", + "x", + "y" + ] + ] + }, + "ret_type": "", + "variables": { + "z": "" + }, + "fn_var_occur": { + "z": [ + [ + "z", + "builtins", + "int", + "x", + "y" + ] + ] + }, + "params_descr": { + "x": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } + ], + "set": null, + "tc": false, + "no_types_annot": { + "U": 0, + "D": 0, + "I": 0 + }, + "type_annot_cove": 0.0 +} + } + } +} \ No newline at end of file diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 5fb08c4..611258d 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -16,7 +16,8 @@ class Foo: class Delta: foo_d = 'Hello, Delta!' foo_p = Path('/home/foo/bar') - def __init__(): + def __init__(self): + self.i = 10 def foo_inner(c, d=lambda a,b: a == b): pass def foo_fn(self, y): @@ -57,7 +58,8 @@ class Foo: class Delta: foo_d = 'Hello, Delta!' foo_p: pathlib.Path = Path('/home/foo/bar') - def __init__(): + def __init__(self): + self.i: int = 10 def foo_inner(c: str, d=lambda a,b: a == b): pass def foo_fn(self, y)-> typing.Dict[builtins.str, builtins.bool]: @@ -79,6 +81,38 @@ def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[bu return v """ +test_file_typed = """a: int = 12 +l: List[int] = [1,2,3] +c = 2.71 +def foo(x: int, y: int) -> int: + z: int = x + y + return z +class Bar: + bar_var1: str = "Hello, Bar!" 
+ bar_var2: float = 3.14 + def __init__(a: int, b): + self.a: int = a + self.b = b + def delta(n: int) -> List[float]: + return [2.17] * p +""" + +test_file_typed_exp = """a = 12 +l = [1,2,3] +c = 2.71 +def foo(x, y): + z = x + y + return z +class Bar: + bar_var1 = "Hello, Bar!" + bar_var2 = 3.14 + def __init__(a, b): + self.a = a + self.b = b + def delta(n): + return [2.17] * p +""" + class TestTypeAnnotatingProjects(unittest.TestCase): """ @@ -92,8 +126,11 @@ def __init__(self, *args, **kwargs): def setUpClass(cls): mk_dir_not_exist('./tmp_ta') write_file('./tmp_ta/type_apply.py', test_file) + write_file('./tmp_ta/type_apply_typed.py', test_file_typed) + # from libsa4py.cst_extractor import Extractor - # save_json('./tmp_ta/type_apply_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply.py')).to_dict()) + # # save_json('./tmp_ta/type_apply_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply.py')).to_dict()) + # save_json('./tmp_ta/type_apply_typed_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply_typed.py')).to_dict()) def test_type_apply_pipeline(self): ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) @@ -109,6 +146,15 @@ def test_type_apply_pipeline(self): # The imported types from typing self.assertEqual(Counter(" ".join(exp_split[0:7])), Counter(" ".join(out_split[0:7]))) + def test_type_apply_remove_annot(self): + """ + Tests the removal of type annotations if not present in the JSON output + """ + ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) + ta.process_project('./examples/type_apply_typed_ex.json') + + self.assertEqual(test_file_typed_exp, read_file('./tmp_ta/type_apply_typed.py')) + @classmethod def tearDownClass(cls): shutil.rmtree("./tmp_ta/") From 54ba46b723a3b47620a668dcb43b8f943e71d759 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 10:24:39 +0200 Subject: [PATCH 02/31] Removing type annotations that do not type check by mypy [WIP] - Part 2 --- libsa4py/__main__.py | 13 ++++- libsa4py/cst_pipeline.py | 109 +++++++++++++++++++++------------------ 2 files changed, 72 insertions(+), 50 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index fc891cf..16a0b31 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -1,7 +1,7 @@ from argparse import ArgumentParser from multiprocessing import cpu_count from libsa4py.utils import find_repos_list -from libsa4py.cst_pipeline import Pipeline, TypeAnnotatingProjects +from libsa4py.cst_pipeline import Pipeline, TypeAnnotatingProjects, TypeAnnotationsRemoval from libsa4py.merge import merge_projects @@ -16,6 +16,11 @@ def apply_types_projects(args): tap.run(args.j) +def remove_err_type_annotations(args): + tar = TypeAnnotationsRemoval(args.p, args.o, "") + tar.run(args.j) + + def main(): arg_parser = ArgumentParser(description="Light-weight static analysis to extract Python's code representations") @@ -53,6 +58,12 @@ def main(): apply_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing projects") apply_parser.set_defaults(func=apply_types_projects) + remove_parser = sub_parsers.add_parser('remove') + remove_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") + remove_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") + remove_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing files") + remove_parser.set_defaults(func=remove_err_type_annotations) + args = 
arg_parser.parse_args() args.func(args) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index b62f00f..042cb7b 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -59,7 +59,8 @@ def __init__(self, projects_path, output_dir, nlp_transf: bool = True, if self.use_tc: self.tc = MypyManager('mypy', MAX_TC_TIME) - self.split_dataset_files = {f:s for s, f in csv.reader(open(split_files_path, 'r'))} if split_files_path is not None else {} + self.split_dataset_files = {f: s for s, f in + csv.reader(open(split_files_path, 'r'))} if split_files_path is not None else {} # TODO: Fix the logger issue not outputing the logs into the file. # logging.basicConfig(filename=join(self.err_log_dir, "pipeline_errors.log"), level=logging.DEBUG, @@ -83,17 +84,17 @@ def __setup_pipeline_logger(self, log_dir: str): logger_ch = logging.StreamHandler() logger_ch.setLevel(logging.DEBUG) - + logger_fh = logging.FileHandler(filename=log_dir) logger_fh.setLevel(logging.DEBUG) - + logger_formatter = logging.Formatter(fmt='%(asctime)s - %(name)s - %(message)s') logger_ch.setFormatter(logger_formatter) logger_fh.setFormatter(logger_formatter) logger.addHandler(logger_ch) logger.addHandler(logger_fh) - + return logger def get_project_filename(self, project) -> str: @@ -125,14 +126,17 @@ def fn_nlp_transf(fn_d: dict, nlp_prep: NLPreprocessor): fn_d['docstring']['long_descr'] = nlp_prep.process_sentence(fn_d['docstring']['long_descr']) return fn_d - extracted_module['variables'] = {self.nlp_prep.process_identifier(v): t for v, t in extracted_module['variables'].items()} + extracted_module['variables'] = {self.nlp_prep.process_identifier(v): t for v, t in + extracted_module['variables'].items()} extracted_module['mod_var_occur'] = {v: [self.nlp_prep.process_sentence(j) for i in o for j in i] for v, - o in extracted_module['mod_var_occur'].items()} + o in + extracted_module['mod_var_occur'].items()} for c in extracted_module['classes']: c['variables'] = {self.nlp_prep.process_identifier(v): t for v, t in c['variables'].items()} c['cls_var_occur'] = {v: [self.nlp_prep.process_sentence(j) for i in o for j in i] for v, - o in c['cls_var_occur'].items()} + o in + c['cls_var_occur'].items()} c['funcs'] = [fn_nlp_transf(f, self.nlp_prep) for f in c['funcs']] extracted_module['funcs'] = [fn_nlp_transf(f, self.nlp_prep) for f in extracted_module['funcs']] @@ -157,7 +161,8 @@ def process_project(self, i, project): project_files = [(f, str(Path(f).relative_to(Path(self.projects_path).parent))) for f in project_files] project_files = [(f, f_r, self.split_dataset_files[f_r] if f_r in self.split_dataset_files else None) for f, - f_r in project_files] + f_r + in project_files] if len(project_files) != 0: if self.use_pyre: @@ -195,10 +200,10 @@ def process_project(self, i, project): # fail the entire project processing. # TODO: A better workaround would be to have a specialized exception thrown # by the extractor, so that this exception is specialized. 
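             # A failure in a single file is logged below and skipped, so that it
             # does not abort processing of the remaining files in the project.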
- #print(f"Could not process file {filename}") + # print(f"Could not process file {filename}") traceback.print_exc() self.logger.error("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) - #logging.error("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + # logging.error("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) print(f'Saving available type hints for {project_id}...') if self.avl_types_dir is not None: @@ -241,7 +246,8 @@ def run(self, repos_list: List[Dict], jobs, start=0): start_t = time.time() ParallelExecutor(n_jobs=jobs)(total=len(repos_list))( delayed(self.process_project)(i, project) for i, project in enumerate(repos_list, start=start)) - print("Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time()-start_t)))) + print( + "Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time() - start_t)))) if self.use_pyre: pyre_kill_all_servers() @@ -295,38 +301,47 @@ def __init__(self, projects_path: str, processed_projects_path: str, output_path self.output_path = output_path self.apply_nlp = apply_nlp - def process_file(self, f:str, f_d_repr: dict): + def process_file(self, f: str, f_d_repr: dict): f_read = read_file(join(self.projects_path, f)) - # TODO: The inital type-checking should not be done after adding no. type errors to the representation later on. + # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), MypyManager('mypy', MAX_TC_TIME)) if init_tc == False and init_no_tc_err is None: return else: - self.__remove_unchecked_type_annot(f_read, f_d_repr, ) - + tmp_f = create_tmp_file(".py") + f_tc_code, tc_errs = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) + print(f"F: {Path(f).name} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs}") + # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) + # write_file(join(self.projects_path, f), f_tc_code) + delete_tmp_file(tmp_f) def run(self, jobs: int): - self.merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) not_tced_src_f: List[Tuple[str, dict]] = [] - for p, p_v in list(self.merged_projects['projects'].items()): + for p, p_v in list(merged_projects['projects'].items()): for f, f_v in p_v['src_files'].items(): if not f_v['tc']: not_tced_src_f.append((f, f_v)) - def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int): - tmp_f = create_tmp_file(".py") + ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d) \ + for f, f_d in not_tced_src_f) + + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, + f_out_temp: NamedTemporaryFile) -> Tuple[str, int]: + out_f_code: str = "" for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") f_d_repr['variables'][m_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == 
init_no_tc_err: f_d_repr['variables'][m_v] = m_v_t @@ -335,11 +350,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['funcs'][i]['params'][p_n] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['params'][p_n] = p_t @@ -347,11 +363,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['funcs'][i]['variables'][fn_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t @@ -360,11 +377,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ org_t = f_d_repr['funcs'][i]['ret_type'] print(f"Type-checking function {f_d_repr['funcs'][i]['name']} return with {org_t}") f_d_repr['funcs'][i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['ret_type'] = org_t @@ -374,11 +392,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if c_v_t != "": print(f"Type checking class variable {c_v} with annotation {c_v_t}") f_d_repr['classes'][c_i]['variables'][c_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t @@ -388,11 +407,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t @@ -401,30 +421,31 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = p - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, 
tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": org_t = f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] - print(f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") + print( + f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, tmp_f) + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code + return f_code, no_tc_err elif no_tc_err < init_no_tc_err: out_f_code = f_code + init_no_tc_err = no_tc_err elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - # apply_inferred_types(src_f_read, src_f_ext, src_f_o_path) - delete_tmp_file(tmp_f) - return out_f_code + return out_f_code, init_no_tc_err def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, @@ -432,13 +453,3 @@ def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: Named write_to_tmp_file(out_f, f_t_applied.code) tc, no_tc_err = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) return tc, no_tc_err, f_t_applied.code - - - - - - - - - - From dfe612e2605ab972631d2e333132c3ca240822d1 Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Thu, 1 Jul 2021 11:49:37 +0200 Subject: [PATCH 03/31] Removing type annotations that do not type check by mypy [WIP] - Part 3 --- libsa4py/cst_pipeline.py | 6 +++--- libsa4py/cst_transformers.py | 17 +++++++++++++---- libsa4py/type_check.py | 12 ++++++++---- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 042cb7b..bad9367 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -318,7 +318,7 @@ def process_file(self, f: str, f_d_repr: dict): delete_tmp_file(tmp_f) def run(self, jobs: int): - merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + merged_projects = load_json(join(self.processed_projects_path, "merged_512_projects.json")) not_tced_src_f: List[Tuple[str, dict]] = [] for p, p_v in list(merged_projects['projects'].items()): for f, f_v in p_v['src_files'].items(): @@ -406,7 +406,7 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ for p_n, p_t in fn["params"].items(): if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: return f_code, no_tc_err @@ -420,7 +420,7 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") - 
f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = p
+                        f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = ""
                         tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp)
                         if tc:
                             return f_code, no_tc_err
diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py
index 23f18b8..a5d4558 100644
--- a/libsa4py/cst_transformers.py
+++ b/libsa4py/cst_transformers.py
@@ -1011,6 +1011,7 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine,
                               updated_node: cst.SimpleStatementLine):
 
         # Untyped variables
+        t = None
         if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget(
             target=match.DoNotCare())])])):
             if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget(
@@ -1050,10 +1051,18 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine,
                                  annotation=t_annot_node,
                                  equal=original_node.body[0].equal)])
             else:
-                return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target,
-                                  whitespace_before_equal=original_node.body[0].equal.whitespace_before,
-                                  whitespace_after_equal=original_node.body[0].equal.whitespace_after)],
-                                  value=original_node.body[0].value)])
+                try:
+                    return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target,
+                                  whitespace_before_equal=original_node.body[0].equal.whitespace_before,
+                                  whitespace_after_equal=original_node.body[0].equal.whitespace_after)],
+                                  value=original_node.body[0].value)])
+                except AttributeError:
+                    print("AT", original_node.body[0])
+                    return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target,
+                                  whitespace_before_equal=original_node.body[0].equal.whitespace_before,
+                                  whitespace_after_equal=original_node.body[0].equal.whitespace_after)],
+                                  value=original_node.body[0].value)])
+
 
         return original_node
 
diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py
index ab97d2d..abcfe28 100644
--- a/libsa4py/type_check.py
+++ b/libsa4py/type_check.py
@@ -166,8 +166,12 @@ def _report_errors(self, parsed_result):
 
 
 def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None]]:
-    no_t_err = tc.heavy_assess(f_path)
-    if no_t_err is not None:
-        return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs)
-    else:
+    try:
+        no_t_err = tc.heavy_assess(f_path)
+        if no_t_err is not None:
+            return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs)
+        else:
+            return False, None
+    except IndexError:
+        print(f"f: {f_path} - No output from Mypy!")
         return False, None

From d2a41f222c7ab64098bcb32645d26f9ed7ddde19 Mon Sep 17 00:00:00 2001
From: mir-am
Date: Thu, 1 Jul 2021 13:56:42 +0200
Subject: [PATCH 04/31] Fixed AttributeError when removing annotations for uninitialized vars

---
 libsa4py/cst_transformers.py | 18 ++++++------------
 tests/examples/type_apply_typed_ex.json | 3 ++-
 tests/test_type_apply.py | 2 ++
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py
index a5d4558..7576f65 100644
--- a/libsa4py/cst_transformers.py
+++ b/libsa4py/cst_transformers.py
@@ -1034,7 +1034,8 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine,
                                whitespace_before=original_node.body[0].targets[0].whitespace_before_equal))]
                 )
         # Typed variables
-        elif match.matches(original_node, 
match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare())])): + elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare(), + value=match.MatchIfTrue(lambda v: v is not None))])): if match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Name(value=match.DoNotCare()))])): t = self.__get_var_type_an_assign(original_node.body[0].target.value) elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.Attribute(value=match.Name(value=match.DoNotCare()), @@ -1051,17 +1052,10 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, annotation=t_annot_node, equal=original_node.body[0].equal)]) else: - try: - return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, - whitespace_before_equal=original_node.body[0].equal.whitespace_before, - whitespace_after_equal=original_node.body[0].equal.whitespace_after)], - value=original_node.body[0].value)]) - except AttributeError: - print("AT", original_node.body[0]) - return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, - whitespace_before_equal=original_node.body[0].equal.whitespace_before, - whitespace_after_equal=original_node.body[0].equal.whitespace_after)], - value=original_node.body[0].value)]) + return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, + whitespace_before_equal=original_node.body[0].equal.whitespace_before, + whitespace_after_equal=original_node.body[0].equal.whitespace_after)], + value=original_node.body[0].value)]) return original_node diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index 85bd099..f40b525 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -8,7 +8,8 @@ "variables": { "a": "", "l": "", - "c": "" + "c": "", + "h": "" }, "mod_var_occur": { "a": [ diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 611258d..427a0ab 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -84,6 +84,7 @@ def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[bu test_file_typed = """a: int = 12 l: List[int] = [1,2,3] c = 2.71 +h: dict def foo(x: int, y: int) -> int: z: int = x + y return z @@ -100,6 +101,7 @@ def delta(n: int) -> List[float]: test_file_typed_exp = """a = 12 l = [1,2,3] c = 2.71 +h: dict def foo(x, y): z = x + y return z From a1703082f40f0941a0d2f8878753ad77a529caf6 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 14:17:59 +0200 Subject: [PATCH 05/31] Removing type annotations that do not type check by mypy [WIP] - Part 4 --- libsa4py/cst_pipeline.py | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index bad9367..74f511c 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -311,8 +311,9 @@ def process_file(self, f: str, f_d_repr: dict): return else: tmp_f = create_tmp_file(".py") - f_tc_code, tc_errs = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) - print(f"F: {Path(f).name} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs}") + f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, + tmp_f) + print(f"F: {f} | 
init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r}") # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) # write_file(join(self.projects_path, f), f_tc_code) delete_tmp_file(tmp_f) @@ -329,19 +330,22 @@ def run(self, jobs: int): for f, f_d in not_tced_src_f) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, - f_out_temp: NamedTemporaryFile) -> Tuple[str, int]: + f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: out_f_code: str = "" + type_annots_removed: List[str] = [] for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") f_d_repr['variables'][m_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(m_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(m_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['variables'][m_v] = m_v_t @@ -352,10 +356,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['funcs'][i]['params'][p_n] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(p_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(p_t) elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['params'][p_n] = p_t @@ -365,10 +371,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['funcs'][i]['variables'][fn_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(fn_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(fn_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t @@ -379,10 +387,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['funcs'][i]['ret_type'] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(org_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(org_t) elif no_tc_err == init_no_tc_err: f_d_repr['funcs'][i]['ret_type'] = org_t @@ -394,10 +404,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['variables'][c_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(c_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(c_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t @@ -409,10 +421,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = "" tc, no_tc_err, f_code = 
self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(p_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(p_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t @@ -423,10 +437,12 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(fn_v_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(fn_v_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t @@ -438,14 +454,16 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if tc: - return f_code, no_tc_err + type_annots_removed.append(org_t) + return f_code, no_tc_err, type_annots_removed elif no_tc_err < init_no_tc_err: out_f_code = f_code init_no_tc_err = no_tc_err + type_annots_removed.append(org_t) elif no_tc_err == init_no_tc_err: f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - return out_f_code, init_no_tc_err + return out_f_code, init_no_tc_err, type_annots_removed def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, From f7b286d911a4cc0fc6a7239403f6ea0d29e31123 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 14:43:09 +0200 Subject: [PATCH 06/31] Removing type annotations that do not type check by mypy [WIP] - Part 4 --- libsa4py/cst_pipeline.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 74f511c..0d652f2 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -10,6 +10,7 @@ from pathlib import Path from datetime import timedelta from joblib import delayed +from multiprocessing import Manager from dpu_utils.utils.dataloading import load_jsonl_gz from libsa4py.cst_extractor import Extractor from libsa4py.cst_transformers import TypeApplier @@ -301,7 +302,7 @@ def __init__(self, projects_path: str, processed_projects_path: str, output_path self.output_path = output_path self.apply_nlp = apply_nlp - def process_file(self, f: str, f_d_repr: dict): + def process_file(self, f: str, f_d_repr: dict, tc_res: dict): f_read = read_file(join(self.projects_path, f)) # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. 
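         # `tc_res` is a dict shared between workers via multiprocessing.Manager;
         # each entry maps a source file to its before/after type-error counts and
         # the annotations that were removed from it.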
init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), @@ -313,7 +314,10 @@ def process_file(self, f: str, f_d_repr: dict): tmp_f = create_tmp_file(".py") f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) - print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r}") + print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ + total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") + tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) # write_file(join(self.projects_path, f), f_tc_code) delete_tmp_file(tmp_f) @@ -326,9 +330,13 @@ def run(self, jobs: int): if not f_v['tc']: not_tced_src_f.append((f, f_v)) - ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d) \ + manager = Manager() + tc_res = manager.dict() + ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) + save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res) + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: From 715430a8752d7d5018aba6ca8d9c5d78a24b1825 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 1 Jul 2021 15:32:26 +0200 Subject: [PATCH 07/31] Removing type annotations that do not type check by mypy [WIP] - Part 5 --- libsa4py/cst_pipeline.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 0d652f2..ac33ba7 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -311,16 +311,18 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): if init_tc == False and init_no_tc_err is None: return else: - tmp_f = create_tmp_file(".py") - f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, - tmp_f) - print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ - total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") - tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, - "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} - # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - # write_file(join(self.projects_path, f), f_tc_code) - delete_tmp_file(tmp_f) + # Only files with type annotations + if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: + tmp_f = create_tmp_file(".py") + f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, + tmp_f) + print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ + total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") + tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} + # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) + # 
write_file(join(self.projects_path, f), f_tc_code) + delete_tmp_file(tmp_f) def run(self, jobs: int): merged_projects = load_json(join(self.processed_projects_path, "merged_512_projects.json")) @@ -335,7 +337,7 @@ def run(self, jobs: int): ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) - save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res) + save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res.copy()) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: From 8f46016dfff3a8b629dc49582ccd27f7b01aea9b Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Fri, 2 Jul 2021 23:04:21 +0200 Subject: [PATCH 08/31] Removing type annotations that do not type check by mypy [WIP] - Part 6 --- libsa4py/cst_pipeline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 0d652f2..c2edcc3 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -314,28 +314,28 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): tmp_f = create_tmp_file(".py") f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, init_no_tc_err, tmp_f) - print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ - total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") + print(f"F: {f} | init_tc_errors: {init_no_tc_err} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") tc_res[f] = {"init_tc_errs": init_no_tc_err, "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - # write_file(join(self.projects_path, f), f_tc_code) + write_file(join(self.projects_path, f), f_tc_code) delete_tmp_file(tmp_f) def run(self, jobs: int): - merged_projects = load_json(join(self.processed_projects_path, "merged_512_projects.json")) + merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) not_tced_src_f: List[Tuple[str, dict]] = [] for p, p_v in list(merged_projects['projects'].items()): for f, f_v in p_v['src_files'].items(): if not f_v['tc']: not_tced_src_f.append((f, f_v)) + #not_tced_src_f = not_tced_src_f[:250] manager = Manager() tc_res = manager.dict() ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) - save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res) + save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res.copy()) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: From c63897306b1cc5a39791135732526ec5b6050b8b Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Mon, 12 Jul 2021 11:19:04 +0200 Subject: [PATCH 09/31] Improvements to the TypeAnnotationsRemoval pipeline --- libsa4py/cst_pipeline.py | 190 ++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 71 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 2b1ef84..692387e 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ 
-313,8 +313,8 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): # Only files with type annotations if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: try: - f_read = read_file(join(self.projects_path, f)) tmp_f = create_tmp_file(".py") + f_read = read_file(join(self.projects_path, f)) f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], tmp_f) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ @@ -337,81 +337,113 @@ def run(self, jobs: int): if not f_v['tc'][0] and f_v['tc'] != [False, None]: not_tced_src_f.append((f, f_v)) - print("L:", len(not_tced_src_f)) - #not_tced_src_f = not_tced_src_f[:250] + del merged_projects + # not_tced_src_f = not_tced_src_f[:250] + # print("L:", len(not_tced_src_f)) manager = Manager() tc_res = manager.dict() ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ for f, f_d in not_tced_src_f) - save_json(join(self.processed_projects_path, "tc_ta_results.json"), tc_res.copy()) + save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: - out_f_code: str = "" type_annots_removed: List[str] = [] + + def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): + tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + if no_tc_err is not None: + if tc: + type_annots_removed.append(org_gt) + elif no_tc_err < init_no_tc_err: + out_f_code = f_code + init_no_tc_err = no_tc_err + type_annots_removed.append(org_gt) + elif no_tc_err == init_no_tc_err: + org_gt_d = org_gt + + return tc, no_tc_err, f_code + + out_f_code: str = "" for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") f_d_repr['variables'][m_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(m_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(m_v_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['variables'][m_v] = m_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, m_v_t, + f_d_repr['variables'][m_v]) if tc: - type_annots_removed.append(m_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(m_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['variables'][m_v] = m_v_t for i, fn in enumerate(f_d_repr['funcs']): for p_n, p_t in fn['params'].items(): if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['funcs'][i]['params'][p_n] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) - if tc: - type_annots_removed.append(p_t) - return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(p_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['funcs'][i]['params'][p_n] = p_t + # tc, no_tc_err, f_code = 
self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(p_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(p_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['funcs'][i]['params'][p_n] = p_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['funcs'][i]['params'][p_n]) + if tc: + return f_code, no_tc_err, type_annots_removed for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['funcs'][i]['variables'][fn_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(fn_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(fn_v_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['funcs'][i]['variables'][fn_v]) if tc: - type_annots_removed.append(fn_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(fn_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t # The return type for module-level functions if f_d_repr['funcs'][i]['ret_type'] != "": org_t = f_d_repr['funcs'][i]['ret_type'] print(f"Type-checking function {f_d_repr['funcs'][i]['name']} return with {org_t}") f_d_repr['funcs'][i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(org_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(org_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['funcs'][i]['ret_type'] = org_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['funcs'][i]['ret_type']) if tc: - type_annots_removed.append(org_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(org_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['funcs'][i]['ret_type'] = org_t # The type of class-level vars for c_i, c in enumerate(f_d_repr['classes']): @@ -419,16 +451,20 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if c_v_t != "": print(f"Type checking class variable {c_v} with annotation {c_v_t}") f_d_repr['classes'][c_i]['variables'][c_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(c_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(c_v_t) + # elif no_tc_err == init_no_tc_err: + # 
f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, c_v_t, + f_d_repr['classes'][c_i]['variables'][c_v]) if tc: - type_annots_removed.append(c_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(c_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t # The type of arguments for class-level functions for fn_i, fn in enumerate(c['funcs']): @@ -436,32 +472,40 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ if p_t != "": print(f"Type-checking function parameter {p_n} with annotation {p_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(p_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(p_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) if tc: - type_annots_removed.append(p_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(p_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t # The type of local variables for class-level functions for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": print(f"Type-checking function variable {fn_v} with annotation {fn_v_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # type_annots_removed.append(fn_v_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(fn_v_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) if tc: - type_annots_removed.append(fn_v_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(fn_v_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": @@ -469,16 +513,20 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ print( f"Annotating function {f_d_repr['classes'][c_i]['funcs'][fn_i]['name']} return with type {org_t}") f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = "" - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + # if tc: + # 
type_annots_removed.append(org_t) + # return f_code, no_tc_err, type_annots_removed + # elif no_tc_err < init_no_tc_err: + # out_f_code = f_code + # init_no_tc_err = no_tc_err + # type_annots_removed.append(org_t) + # elif no_tc_err == init_no_tc_err: + # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t + tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) if tc: - type_annots_removed.append(org_t) return f_code, no_tc_err, type_annots_removed - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err - type_annots_removed.append(org_t) - elif no_tc_err == init_no_tc_err: - f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t return out_f_code, init_no_tc_err, type_annots_removed From 7aa76bdd7b359659c64fd0879afc57a667a5e82a Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 12 Jul 2021 11:32:04 +0200 Subject: [PATCH 10/31] Improve type annotation removal code --- libsa4py/cst_pipeline.py | 46 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 692387e..e015697 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -352,16 +352,16 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ type_annots_removed: List[str] = [] - def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): + def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) if no_tc_err is not None: if tc: type_annots_removed.append(org_gt) - elif no_tc_err < init_no_tc_err: - out_f_code = f_code - init_no_tc_err = no_tc_err + elif no_tc_err < curr_no_tc_err: + curr_f_code = f_code + curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) - elif no_tc_err == init_no_tc_err: + elif no_tc_err == curr_no_tc_err: org_gt_d = org_gt return tc, no_tc_err, f_code @@ -381,8 +381,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(m_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['variables'][m_v] = m_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, m_v_t, - f_d_repr['variables'][m_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, m_v_t, + f_d_repr['variables'][m_v]) if tc: return f_code, no_tc_err, type_annots_removed @@ -401,10 +401,10 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['params'][p_n] = p_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['funcs'][i]['params'][p_n]) - if tc: - return f_code, no_tc_err, type_annots_removed + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['funcs'][i]['params'][p_n]) + if tc: + return f_code, no_tc_err, type_annots_removed for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": @@ -420,8 +420,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['funcs'][i]['variables'][fn_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['funcs'][i]['variables'][fn_v]) 
if tc: return f_code, no_tc_err, type_annots_removed @@ -440,8 +440,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['ret_type'] = org_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['funcs'][i]['ret_type']) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['funcs'][i]['ret_type']) if tc: return f_code, no_tc_err, type_annots_removed @@ -461,8 +461,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(c_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, c_v_t, - f_d_repr['classes'][c_i]['variables'][c_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, c_v_t, + f_d_repr['classes'][c_i]['variables'][c_v]) if tc: return f_code, no_tc_err, type_annots_removed @@ -482,8 +482,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) if tc: return f_code, no_tc_err, type_annots_removed @@ -502,8 +502,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) if tc: return f_code, no_tc_err, type_annots_removed @@ -523,8 +523,8 @@ def remove_ta(init_no_tc_err: int, out_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - tc, no_tc_err, f_code = remove_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) + tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) if tc: return f_code, no_tc_err, type_annots_removed From 17f8bccad42539c7a2d68e4a8a73933869505b78 Mon Sep 17 00:00:00 2001 From: Amir Mir Date: Mon, 19 Jul 2021 10:22:20 +0200 Subject: [PATCH 11/31] Improve TypeApplier by matching functions by line and column no. & matching class names' QN using IN operator --- libsa4py/cst_transformers.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 7576f65..d531fd9 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -859,7 +859,8 @@ class TypeApplier(cst.CSTTransformer): Specifically, it applies the type of arguments, return types, and variables' type. 
""" - METADATA_DEPENDENCIES = (cst.metadata.ScopeProvider, cst.metadata.QualifiedNameProvider) + METADATA_DEPENDENCIES = (cst.metadata.ScopeProvider, cst.metadata.QualifiedNameProvider, + cst.metadata.PositionProvider) def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.f_processed_dict = f_processeed_dict @@ -884,8 +885,9 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: fns = self.f_processed_dict['funcs'] for fn in fns: - if fn['q_name'] == self.__get_qualified_name(f_node.name) and \ - set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + # if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ + # set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + if fn['fn_lc'] == self.__get_line_column_no(f_node): return fn def __get_fn_param_type(self, param_name: str): @@ -899,7 +901,8 @@ def __get_fn_param_type(self, param_name: str): def __get_cls(self, cls: cst.ClassDef) -> dict: for c in self.f_processed_dict['classes']: - if c['q_name'] == self.__get_qualified_name(cls.name): + q = self.__get_qualified_name(cls.name) + if c['q_name'] in self.__get_qualified_name(cls.name): return c def __get_fn_vars(self, var_name: str) -> dict: @@ -1130,6 +1133,10 @@ def __get_qualified_name(self, node) -> Optional[str]: q_name = list(self.get_metadata(cst.metadata.QualifiedNameProvider, node)) return q_name[0].name if len(q_name) != 0 else None + def __get_line_column_no(self, node) -> List[List[int]]: + lc = self.get_metadata(cst.metadata.PositionProvider, node) + return [[lc.start.line, lc.start.column], [lc.end.line, lc.end.column]] + def resolve_type_alias(self, t: str): type_aliases = {'^{}$|^Dict$|(?<=.*)Dict\[\](?<=.*)|(?<=.*)Dict\[Any, *?Any\](?=.*)|^Dict\[unknown, *Any\]$': 'dict', '^Set$|(?<=.*)Set\[\](?<=.*)|^Set\[Any\]$': 'set', From 221c14c5c97eb1c90fb65bdd1eee9308ea531e37 Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 19 Jul 2021 11:23:22 +0200 Subject: [PATCH 12/31] Fix unit tests for TypeApplier when matching functions based on line and column no. 
--- libsa4py/cst_transformers.py | 6 +- tests/examples/type_apply_ex.json | 909 ++++++++++++++---------- tests/examples/type_apply_typed_ex.json | 8 +- tests/test_type_apply.py | 10 +- 4 files changed, 551 insertions(+), 382 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index d531fd9..53e5863 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -887,7 +887,7 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: for fn in fns: # if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ # set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): - if fn['fn_lc'] == self.__get_line_column_no(f_node): + if (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): return fn def __get_fn_param_type(self, param_name: str): @@ -1133,9 +1133,9 @@ def __get_qualified_name(self, node) -> Optional[str]: q_name = list(self.get_metadata(cst.metadata.QualifiedNameProvider, node)) return q_name[0].name if len(q_name) != 0 else None - def __get_line_column_no(self, node) -> List[List[int]]: + def __get_line_column_no(self, node) -> Tuple[int, int]: lc = self.get_metadata(cst.metadata.PositionProvider, node) - return [[lc.start.line, lc.start.column], [lc.end.line, lc.end.column]] + return lc.start.line, lc.end.line def resolve_type_alias(self, t: str): type_aliases = {'^{}$|^Dict$|(?<=.*)Dict\[\](?<=.*)|(?<=.*)Dict\[Any, *?Any\](?=.*)|^Dict\[unknown, *Any\]$': 'dict', diff --git a/tests/examples/type_apply_ex.json b/tests/examples/type_apply_ex.json index 1727f94..5976215 100644 --- a/tests/examples/type_apply_ex.json +++ b/tests/examples/type_apply_ex.json @@ -1,417 +1,584 @@ { - "tests/examples": { - "src_files": { - "type_apply.py": { - "untyped_seq": "from typing import Tuple , Dict , List , Literal [EOL] from collections import defaultdict [EOL] import pandas [EOL] import pathlib [EOL] import builtins [EOL] import collections [EOL] import typing [EOL] from pathlib import Path [EOL] x = [number] [EOL] l = [ ( [number] , [number] ) ] [EOL] c = defaultdict ( int ) [EOL] df = pd . DataFrame ( [ [number] , [number] ] ) [EOL] dff = pd . DataFrame ( [ [number] , [number] ] ) [EOL] lit = [string] [EOL] class Foo : [EOL] foo_v = [string] [EOL] class Delta : [EOL] foo_d = [string] [EOL] foo_p = Path ( [string] ) [EOL] def __init__ ( ) : [EOL] def foo_inner ( c , d ) : [EOL] pass [EOL] def foo_fn ( self , y ) : [EOL] def foo_inner ( a , b , c , d ) : [EOL] pass [EOL] d = { [string] : True } [EOL] return d [EOL] @ event . getter def get_e ( self ) : [EOL] return Foo . foo_v [EOL] @ event . setter def get_e ( self , y ) : [EOL] Foo . foo_v = y [EOL] return Foo . 
foo_v [EOL] foo_v = [string] [EOL] def Bar ( x = [ [string] , [string] ] ) : [EOL] v = x [EOL] l = lambda e : e + [number] [EOL] return v [EOL]", - "typed_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 0 0 $typing.List[typing.Tuple[builtins.int,builtins.int]]$ 0 0 0 0 0 0 0 0 0 $collections.defaultdict$ 0 0 0 0 0 0 $pandas.DataFrame$ 0 0 0 0 0 0 0 0 0 0 0 0 $typing.List[pandas.arrays.PandasArray]$ 0 0 0 0 0 0 0 0 0 0 0 0 $typing.Literal$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $pathlib.Path$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $typing.Dict[builtins.str,builtins.bool]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $typing.Dict[builtins.str,builtins.bool]$ 0 0 0 0 0 0 0 0 $typing.Dict[builtins.str,builtins.bool]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 0 0 0 0 0 0 0 0 $typing.List[builtins.str]$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 $typing.List[typing.Tuple[builtins.int,builtins.int]]$ 0 0 0 0 0 0 0 0 0 0 0", - "imports": [ - "Tuple", - "Dict", - "List", - "Literal", - "defaultdict", - "pandas", - "pathlib", - "builtins", - "collections", - "typing", - "Path" - ], - "variables": { - "x": "builtins.int", - "l": "typing.List[typing.Tuple[builtins.int, builtins.int]]", - "c": "collections.defaultdict", - "df": "pandas.DataFrame", - "dff": "typing.List[pandas.arrays.PandasArray]", - "lit": "typing.Literal" - }, - "mod_var_occur": { - "x": [ - [ - "v", - "typing", - "List", - "builtins", - "str", - "x" - ] - ], - "l": [ - [ - "l", - "e", - "e" - ] - ], - "c": [], - "df": [], - "dff": [], - "lit": [] - }, - "classes": [ - { - "name": "Delta", - "q_name": "Foo.Delta", - "variables": { - "foo_d": "" - }, - "cls_var_occur": { - "foo_d": [] - }, - "funcs": [] - }, - { - "name": "Foo", - "q_name": "Foo", - "variables": { - "foo_v": "str", - "foo_p": "pathlib.Path" - }, - "cls_var_occur": { - "foo_v": [ - [ - "Foo", - "foo_v" - ], - [ - "Foo", - "foo_v", - "y" - ], - [ - "Foo", - "foo_v" - ] + "tests/examples": { + "src_files": { + "type_apply.py": { + "untyped_seq": "", + "typed_seq": "", + "imports": [ + "pathlib", + "Path" ], - "foo_p": [] - }, - "funcs": [ - { - "name": "foo_inner", - "q_name": "Foo.__init__..foo_inner", - "fn_lc": [ + "variables": { + "x": "builtins.int", + "l": "typing.List[typing.Tuple[builtins.int, builtins.int]]", + "c": "collections.defaultdict", + "df": "pandas.DataFrame", + "dff": "typing.List[pandas.arrays.PandasArray]", + "lit": "typing.Literal" + }, + "mod_var_occur": { + "x": [ [ - 21, - 8 - ], + "v", + "typing", + "List", + "builtins", + "str", + "x" + ] + ], + "l": [ [ - 22, - 16 + "l", + "e", + "e" ] ], - "params": { - "c": "str", - "d": "" - }, - "ret_exprs": [], - "params_occur": { - "c": [], - "d": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "c": "", - "d": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } + "c": [], + "df": [], + "dff": [], + "lit": [] }, - { - "name": "__init__", - "q_name": "Foo.__init__", - "fn_lc": [ + "mod_var_ln": { + "x": [ [ - 20, - 4 + 2, + 0 ], [ - 22, - 16 + 2, + 1 ] ], - "params": {"self": ""}, - "ret_exprs": [], - "params_occur": {}, - "ret_type": "", - "variables": {"i": "int"}, - "fn_var_occur": {}, - "params_descr": {}, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "foo_inner", - "q_name": "Foo.foo_fn..foo_inner", - "fn_lc": [ + "l": [ [ - 24, - 8 + 3, + 0 ], [ - 25, - 16 + 3, 
+ 1 ] ], - "params": { - "a": "", - "b": "", - "c": "", - "d": "", - "args": "", - "kwargs": "" - }, - "ret_exprs": [], - "params_occur": { - "a": [], - "b": [], - "c": [], - "d": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "a": "", - "b": "", - "c": "", - "d": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "foo_fn", - "q_name": "Foo.foo_fn", - "fn_lc": [ + "c": [ [ - 23, - 4 + 4, + 0 ], [ - 27, - 16 + 4, + 1 ] ], - "params": { - "self": "", - "y": "" - }, - "ret_exprs": [ - "return d" - ], - "params_occur": { - "self": [], - "y": [] - }, - "ret_type": "typing.Dict[builtins.str, builtins.bool]", - "variables": { - "d": "typing.Dict[builtins.str, builtins.bool]" - }, - "fn_var_occur": { - "d": [ - [ - "d", - "typing", - "Dict", - "builtins", - "str", - "builtins", - "bool", - "True" - ] - ] - }, - "params_descr": { - "self": "", - "y": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "get_e", - "q_name": "Foo.get_e", - "fn_lc": [ + "df": [ [ - 29, - 4 + 5, + 0 ], [ - 30, - 24 + 5, + 2 ] ], - "params": { - "self": "" - }, - "ret_exprs": [ - "return Foo.foo_v" - ], - "params_occur": { - "self": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "self": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null - } - }, - { - "name": "get_e", - "q_name": "Foo.get_e", - "fn_lc": [ + "dff": [ [ - 32, - 4 + 6, + 0 ], [ - 34, - 24 + 6, + 3 ] ], - "params": { - "self": "", - "y": "builtins.str" + "lit": [ + [ + 7, + 0 + ], + [ + 7, + 3 + ] + ] + }, + "classes": [ + { + "name": "Delta", + "q_name": "Foo.Delta", + "variables": { + "foo_d": "" + }, + "cls_var_occur": { + "foo_d": [] + }, + "cls_var_ln": { + "foo_d": [ + [ + 11, + 8 + ], + [ + 11, + 13 + ] + ] + }, + "funcs": [] }, - "ret_exprs": [ - "return Foo.foo_v" - ], - "params_occur": { - "self": [], - "y": [ - [ - "Foo", - "foo_v", - "y" + { + "name": "Foo", + "q_name": "Foo", + "variables": { + "foo_v": "str", + "foo_p": "pathlib.Path" + }, + "cls_var_occur": { + "foo_v": [ + [ + "Foo", + "foo_v" + ], + [ + "Foo", + "foo_v", + "y" + ], + [ + "Foo", + "foo_v" + ] + ], + "foo_p": [] + }, + "cls_var_ln": { + "foo_v": [ + [ + 29, + 4 + ], + [ + 29, + 9 + ] + ], + "foo_p": [ + [ + 12, + 4 + ], + [ + 12, + 9 + ] ] + }, + "funcs": [ + { + "name": "foo_inner", + "q_name": "Foo.__init__..foo_inner", + "fn_lc": [ + [ + 15, + 8 + ], + [ + 16, + 16 + ] + ], + "params": { + "c": "builtins.str", + "d": "" + }, + "ret_exprs": [], + "params_occur": { + "c": [], + "d": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "c": "", + "d": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "__init__", + "q_name": "Foo.__init__", + "fn_lc": [ + [ + 13, + 4 + ], + [ + 16, + 16 + ] + ], + "params": { + "self": "" + }, + "ret_exprs": [], + "params_occur": { + "self": [] + }, + "ret_type": "", + "variables": { + "i": "builtins.int" + }, + "fn_var_occur": { + "i": [] + }, + "fn_var_ln": { + "i": [ + [ + 14, + 8 + ], + [ + 14, + 14 + ] + ] + }, + "params_descr": { + "self": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "foo_inner", + "q_name": "Foo.foo_fn..foo_inner", + "fn_lc": [ + [ + 18, + 8 + ], + [ + 19, + 16 + ] + ], + "params": { + "a": "", + "b": "", + "c": "", + "d": "", + "args": "", + "kwargs": "" + }, 
+ "ret_exprs": [], + "params_occur": { + "a": [], + "b": [], + "c": [], + "d": [], + "args": [], + "kwargs": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "a": "", + "b": "", + "c": "", + "d": "", + "args": "", + "kwargs": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "foo_fn", + "q_name": "Foo.foo_fn", + "fn_lc": [ + [ + 17, + 4 + ], + [ + 21, + 16 + ] + ], + "params": { + "self": "", + "y": "" + }, + "ret_exprs": [ + "return d" + ], + "params_occur": { + "self": [], + "y": [] + }, + "ret_type": "typing.Dict[builtins.str, builtins.bool]", + "variables": { + "d": "typing.Dict[builtins.str, builtins.bool]" + }, + "fn_var_occur": { + "d": [ + [ + "d", + "typing", + "Dict", + "builtins", + "str", + "builtins", + "bool", + "True" + ] + ] + }, + "fn_var_ln": { + "d": [ + [ + 20, + 8 + ], + [ + 20, + 9 + ] + ] + }, + "params_descr": { + "self": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "get_e", + "q_name": "Foo.get_e", + "fn_lc": [ + [ + 23, + 4 + ], + [ + 24, + 24 + ] + ], + "params": { + "self": "" + }, + "ret_exprs": [ + "return Foo.foo_v" + ], + "params_occur": { + "self": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "self": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "get_e", + "q_name": "Foo.get_e", + "fn_lc": [ + [ + 26, + 4 + ], + [ + 28, + 24 + ] + ], + "params": { + "self": "", + "y": "builtins.str" + }, + "ret_exprs": [ + "return Foo.foo_v" + ], + "params_occur": { + "self": [], + "y": [ + [ + "Foo", + "foo_v", + "y" + ] + ] + }, + "ret_type": "", + "variables": { + "foo_v": "" + }, + "fn_var_occur": { + "foo_v": [ + [ + "Foo", + "foo_v", + "y" + ], + [ + "Foo", + "foo_v" + ] + ] + }, + "fn_var_ln": { + "foo_v": [ + [ + 27, + 8 + ], + [ + 27, + 17 + ] + ] + }, + "params_descr": { + "self": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } ] - }, - "ret_type": "", - "variables": { - "foo_v": "" - }, - "fn_var_occur": { - "foo_v": [ + } + ], + "funcs": [ + { + "name": "Bar", + "q_name": "Bar", + "fn_lc": [ [ - "Foo", - "foo_v", - "y" + 30, + 0 ], [ - "Foo", - "foo_v" + 33, + 12 ] - ] - }, - "params_descr": { - "self": "", - "y": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + ], + "params": { + "x": "typing.List[builtins.str]", + "c": "" + }, + "ret_exprs": [ + "return v" + ], + "params_occur": { + "x": [ + [ + "v", + "typing", + "List", + "builtins", + "str", + "x" + ] + ], + "c": [] + }, + "ret_type": "typing.List[builtins.str]", + "variables": { + "v": "typing.List[builtins.str]", + "l": "" + }, + "fn_var_occur": { + "v": [ + [ + "v", + "typing", + "List", + "builtins", + "str", + "x" + ] + ], + "l": [ + [ + "l", + "e", + "e" + ] + ] + }, + "fn_var_ln": { + "v": [ + [ + 31, + 4 + ], + [ + 31, + 5 + ] + ], + "l": [ + [ + 32, + 4 + ], + [ + 32, + 5 + ] + ] + }, + "params_descr": { + "x": "", + "c": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } } - } - ] - } - ], - "funcs": [ - { - "name": "Bar", - "q_name": "Bar", - "fn_lc": [ - [ - 36, - 0 ], - [ - 39, - 12 - ] - ], - "params": { - "x": "typing.List[builtins.str]", - "c": "" - }, - "ret_exprs": [ - "return v" - ], - "params_occur": { - "x": [ - [ - "v", - "typing", - "List", - "builtins", - "str", - "x" - ] - ] - }, - 
"ret_type": "typing.List[builtins.str]", - "variables": { - "v": "typing.List[builtins.str]", - "l": "" - }, - "fn_var_occur": { - "v": [ - [ - "v", - "typing", - "List", - "builtins", - "str", - "x" - ] + "set": null, + "tc": [ + false, + null ], - "l": [ - [ - "l", - "e", - "e" - ] - ] - }, - "params_descr": { - "x": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + "no_types_annot": { + "U": 15, + "D": 14, + "I": 0 + }, + "type_annot_cove": 0.48 } } - ], - "set": null, - "tc": false, - "no_types_annot": { - "U": 12, - "D": 13, - "I": 0 - }, - "type_annot_cove": 0.52 -} } - } } \ No newline at end of file diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index f40b525..374b4a0 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -42,11 +42,11 @@ "q_name": "Bar.__init__", "fn_lc": [ [ - 12, + 11, 4 ], [ - 14, + 13, 18 ] ], @@ -111,11 +111,11 @@ "q_name": "Bar.delta", "fn_lc": [ [ - 15, + 14, 4 ], [ - 16, + 15, 25 ] ], diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 427a0ab..60c777c 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -59,8 +59,8 @@ class Delta: foo_d = 'Hello, Delta!' foo_p: pathlib.Path = Path('/home/foo/bar') def __init__(self): - self.i: int = 10 - def foo_inner(c: str, d=lambda a,b: a == b): + self.i: builtins.int = 10 + def foo_inner(c: builtins.str, d=lambda a,b: a == b): pass def foo_fn(self, y)-> typing.Dict[builtins.str, builtins.bool]: def foo_inner(a, b, c, d, *args, **kwargs): @@ -131,8 +131,10 @@ def setUpClass(cls): write_file('./tmp_ta/type_apply_typed.py', test_file_typed) # from libsa4py.cst_extractor import Extractor - # # save_json('./tmp_ta/type_apply_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply.py')).to_dict()) - # save_json('./tmp_ta/type_apply_typed_ex.json', Extractor.extract(read_file('./tmp_ta/type_apply_typed.py')).to_dict()) + # save_json('./tmp_ta/type_apply_ex.json', {"tests/examples": {"src_files": {"type_apply.py": + # Extractor.extract(read_file('./tmp_ta/type_apply.py'), include_seq2seq=False).to_dict()}}}) + # save_json('./tmp_ta/type_apply_typed_ex.json', {"tests/examples": {"src_files": {"type_apply_typed.py": + # Extractor.extract(read_file('./tmp_ta/type_apply_typed.py'), include_seq2seq=False).to_dict()}}}) def test_type_apply_pipeline(self): ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) From 57eacf62be2bb5b0e9f06ed8153963bdae14c73d Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 19 Jul 2021 14:25:09 +0200 Subject: [PATCH 13/31] Remove superfluous assignment line from TypeApplier --- libsa4py/cst_transformers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 53e5863..ecb7fb6 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -901,7 +901,6 @@ def __get_fn_param_type(self, param_name: str): def __get_cls(self, cls: cst.ClassDef) -> dict: for c in self.f_processed_dict['classes']: - q = self.__get_qualified_name(cls.name) if c['q_name'] in self.__get_qualified_name(cls.name): return c From f0aa00ead2c3b7d4ea6012f062d0548216c72b98 Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 21 Jul 2021 17:19:59 +0200 Subject: [PATCH 14/31] A workaround for a very rare case where the class' QN doesn't match when applying types --- libsa4py/cst_transformers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libsa4py/cst_transformers.py 
b/libsa4py/cst_transformers.py index ecb7fb6..78df5a8 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -901,7 +901,10 @@ def __get_fn_param_type(self, param_name: str): def __get_cls(self, cls: cst.ClassDef) -> dict: for c in self.f_processed_dict['classes']: - if c['q_name'] in self.__get_qualified_name(cls.name): + q = self.__get_qualified_name(cls.name) + if c['q_name'] == q: + return c + elif c['q_name'].split(".")[-1] == q.split(".")[-1]: return c def __get_fn_vars(self, var_name: str) -> dict: From 19428fd60fdd8ad1c5790cf9a8b2637049939efc Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 21 Jul 2021 17:25:04 +0200 Subject: [PATCH 15/31] When applying types, first match functions' QN & signature; if no match, then check line no. --- libsa4py/cst_transformers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 78df5a8..a98cc56 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -885,9 +885,10 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: fns = self.f_processed_dict['funcs'] for fn in fns: - # if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ - # set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): - if (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): + if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ + set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + return fn + elif (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): return fn def __get_fn_param_type(self, param_name: str): From 65130df582ea211201838422598b26dc4019c845 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 22 Jul 2021 13:45:31 +0200 Subject: [PATCH 16/31] Count total no.
of added types in TypeApplier and its pipeline --- libsa4py/cst_pipeline.py | 7 ++++++- libsa4py/cst_transformers.py | 5 +++++ tests/test_type_apply.py | 4 +++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index e015697..a180ae1 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -267,6 +267,7 @@ def __init__(self, projects_path: str, output_path: str, apply_nlp: bool = True) def process_project(self, proj_json_path: str): proj_json = load_json(proj_json_path) + total_added_types = 0 for p in proj_json.keys(): for i, (f, f_d) in enumerate(proj_json[p]['src_files'].items()): f_read = read_file(join(self.projects_path, f)) @@ -274,8 +275,10 @@ def process_project(self, proj_json_path: str): try: f_parsed = cst.parse_module(f_read) try: - f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(TypeApplier(f_d, self.apply_nlp)) + ta = TypeApplier(f_d, self.apply_nlp) + f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(ta) write_file(join(self.projects_path, f), f_parsed.code) + total_added_types += ta.no_applied_types except KeyError as ke: print(f"A variable not found | project {proj_json_path} | file {f}", ke) traceback.print_exc() @@ -285,6 +288,8 @@ def process_project(self, proj_json_path: str): except cst._exceptions.ParserSyntaxError as pse: print(f"Can't parsed file {f} in project {proj_json_path}", pse) + return total_added_types + def run(self, jobs: int): proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index a98cc56..7d9a963 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -872,6 +872,7 @@ def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.lambda_d = 0 self.all_applied_types = set() + self.no_applied_types = 0 if apply_nlp: self.nlp_p = NLPreprocessor().process_identifier @@ -898,6 +899,7 @@ def __get_fn_param_type(self, param_name: str): fn_param_type = self.__name2annotation(fn_param_type_resolved) if fn_param_type is not None: self.all_applied_types.add((fn_param_type_resolved, fn_param_type)) + self.no_applied_types += 1 return fn_param_type def __get_cls(self, cls: cst.ClassDef) -> dict: @@ -1000,6 +1002,7 @@ def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.Fu fn_ret_type = self.__name2annotation(fn_ret_type_resolved) if fn_ret_type is not None: self.all_applied_types.add((fn_ret_type_resolved, fn_ret_type)) + self.no_applied_types += 1 return updated_node.with_changes(returns=fn_ret_type) else: return updated_node.with_changes(returns=None) @@ -1032,6 +1035,7 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, t_annot_node = self.__name2annotation(t_annot_node_resolved) if t_annot_node is not None: self.all_applied_types.add((t_annot_node_resolved, t_annot_node)) + self.no_applied_types += 1 return updated_node.with_changes(body=[cst.AnnAssign( target=original_node.body[0].targets[0].target, value=original_node.body[0].value, @@ -1052,6 +1056,7 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, t_annot_node = self.__name2annotation(t_annot_node_resolved) if t_annot_node is not None: self.all_applied_types.add((t_annot_node_resolved, t_annot_node)) + self.no_applied_types += 1 return updated_node.with_changes(body=[cst.AnnAssign( target=original_node.body[0].target, 
value=original_node.body[0].value, diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 60c777c..704066e 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -138,7 +138,7 @@ def setUpClass(cls): def test_type_apply_pipeline(self): ta = TypeAnnotatingProjects('./tmp_ta', None, apply_nlp=False) - ta.process_project('./examples/type_apply_ex.json') + total_no_added_types = ta.process_project('./examples/type_apply_ex.json') exp_split = test_file_exp.splitlines() out_split = read_file('./tmp_ta/type_apply.py').splitlines() @@ -147,6 +147,8 @@ def test_type_apply_pipeline(self): out = """{}""".format("\n".join(out_split[7:])) self.assertEqual(exp, out) + self.assertEqual(total_no_added_types, 16) + # The imported types from typing self.assertEqual(Counter(" ".join(exp_split[0:7])), Counter(" ".join(out_split[0:7]))) From 61667719fadfa9143bd681d6a1aedcf44d9197a3 Mon Sep 17 00:00:00 2001 From: mir-am Date: Thu, 22 Jul 2021 14:51:53 +0200 Subject: [PATCH 17/31] Improvements to the TypeAnnotatingProjects pipeline --- libsa4py/cst_pipeline.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index a180ae1..1c24da5 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -270,8 +270,9 @@ def process_project(self, proj_json_path: str): total_added_types = 0 for p in proj_json.keys(): for i, (f, f_d) in enumerate(proj_json[p]['src_files'].items()): - f_read = read_file(join(self.projects_path, f)) - if len(f_read) != 0: + print(f"Adding types to file {f} from project {proj_json_path}") + if f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] > 0: + f_read = read_file(join(self.projects_path, f)) try: f_parsed = cst.parse_module(f_read) try: @@ -279,6 +280,7 @@ def process_project(self, proj_json_path: str): f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(ta) write_file(join(self.projects_path, f), f_parsed.code) total_added_types += ta.no_applied_types + print(f"Applied {ta.no_applied_types} types to file {f} from project {proj_json_path}") except KeyError as ke: print(f"A variable not found | project {proj_json_path} | file {f}", ke) traceback.print_exc() @@ -293,7 +295,11 @@ def process_project(self, proj_json_path: str): def run(self, jobs: int): proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True) - ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) for p_j in proj_jsons) + start_t = time.time() + proj_type_added = ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) \ + for p_j in proj_jsons) + print(f"Finished applying types in {str(timedelta(seconds=time.time() - start_t))}") + print(f"{sum(proj_type_added)} types applied to the whole dataset") class TypeAnnotationsRemoval: From 95d3c7e8be30ed57831cb78c01595ca558c1b25c Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 23 Jul 2021 13:32:33 +0200 Subject: [PATCH 18/31] Fix test failure for types removal --- libsa4py/cst_transformers.py | 1 + tests/examples/type_apply_typed_ex.json | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 7d9a963..ea537eb 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -853,6 +853,7 @@ def leave_SubscriptElement(self, original_node, updated_node): return updated_node +# TODO: Write two separate 
CSTTransformers for applying and removing type annotations class TypeApplier(cst.CSTTransformer): """ It applies (inferred) type annotations to a source code file. diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index 374b4a0..388d001 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -214,7 +214,7 @@ "tc": false, "no_types_annot": { "U": 0, - "D": 0, + "D": 1, "I": 0 }, "type_annot_cove": 0.0 From 7927d45edf6960d55ff48643ee7760ed9983e26c Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 26 Jul 2021 10:38:09 +0200 Subject: [PATCH 19/31] Improvements to TypeApplier: (1) Better matching of functions, classes and variables (2) counting failed applied types --- libsa4py/cst_transformers.py | 47 ++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index ea537eb..20261db 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -874,6 +874,7 @@ def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.all_applied_types = set() self.no_applied_types = 0 + self.no_failed_applied_types = 0 if apply_nlp: self.nlp_p = NLPreprocessor().process_identifier @@ -886,11 +887,15 @@ def __get_fn(self, f_node: cst.FunctionDef) -> dict: else: fns = self.f_processed_dict['funcs'] + qn = self.__get_qualified_name(f_node.name) + fn_params = set(self.__get_fn_params(f_node.params)) + fn_lc = self.__get_line_column_no(f_node) for fn in fns: - if fn['q_name'] in self.__get_qualified_name(f_node.name) and \ - set(list(fn['params'].keys())) == set(self.__get_fn_params(f_node.params)): + if (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == fn_lc: return fn - elif (fn['fn_lc'][0][0], fn['fn_lc'][1][0]) == self.__get_line_column_no(f_node): + + for fn in fns: + if fn['q_name'] == qn and set(list(fn['params'].keys())) == fn_params: return fn def __get_fn_param_type(self, param_name: str): @@ -902,13 +907,18 @@ def __get_fn_param_type(self, param_name: str): self.all_applied_types.add((fn_param_type_resolved, fn_param_type)) self.no_applied_types += 1 return fn_param_type + else: + self.no_failed_applied_types += 1 def __get_cls(self, cls: cst.ClassDef) -> dict: + cls_lc = self.__get_line_column_no(cls) + cls_qn = self.__get_qualified_name(cls.name) for c in self.f_processed_dict['classes']: - q = self.__get_qualified_name(cls.name) - if c['q_name'] == q: + if (c['cls_lc'][0][0], c['cls_lc'][1][0]) == cls_lc: return c - elif c['q_name'].split(".")[-1] == q.split(".")[-1]: + + for c in self.f_processed_dict['classes']: + if c['q_name'] == cls_qn: return c def __get_fn_vars(self, var_name: str) -> dict: @@ -932,24 +942,27 @@ def __get_cls_vars(self, var_name: str) -> dict: def __get_mod_vars(self): return self.f_processed_dict['variables'] - def __get_var_type_assign_t(self, var_name: str): + def __get_var_type_assign_t(self, var_name: str, var_node): t: str = None + var_line_no = self.__get_line_column_no(var_node) if len(self.cls_visited) != 0: if len(self.fn_visited) != 0: # A class method's variable - if self.fn_visited[-1][1][var_name] == self.last_visited_assign_t_count: + if self.fn_visited[-1][0]['fn_var_ln'][var_name][0][0] == var_line_no[0]: t = self.__get_fn_vars(self.nlp_p(var_name)) else: # A class variable - if self.cls_visited[-1][1][var_name] == self.last_visited_assign_t_count: + if self.cls_visited[-1][0]["cls_var_ln"][var_name][0][0] == var_line_no[0]: t =
self.__get_cls_vars(self.nlp_p(var_name)) elif len(self.fn_visited) != 0: # A module function's variable - if self.fn_visited[-1][1][var_name] == self.last_visited_assign_t_count: + #if self.fn_visited[-1][1][var_name] == self.last_visited_assign_t_count: + if self.fn_visited[-1][0]['fn_var_ln'][var_name][0][0] == var_line_no[0]: t = self.__get_fn_vars(self.nlp_p(var_name)) else: # A module's variables - t = self.__get_mod_vars()[self.nlp_p(var_name)] + if self.f_processed_dict['mod_var_ln'][var_name][0][0] == var_line_no[0]: + t = self.__get_mod_vars()[self.nlp_p(var_name)] return t def __get_var_type_an_assign(self, var_name: str): @@ -1005,6 +1018,8 @@ def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.Fu self.all_applied_types.add((fn_ret_type_resolved, fn_ret_type)) self.no_applied_types += 1 return updated_node.with_changes(returns=fn_ret_type) + else: + self.no_failed_applied_types += 1 else: return updated_node.with_changes(returns=None) @@ -1026,10 +1041,12 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, target=match.DoNotCare())])])): if match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( target=match.Name(value=match.DoNotCare()))])])): - t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value) + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.value, + original_node.body[0].targets[0].target) elif match.matches(original_node, match.SimpleStatementLine(body=[match.Assign(targets=[match.AssignTarget( target=match.Attribute(value=match.Name(value=match.DoNotCare()), attr=match.Name(value=match.DoNotCare())))])])): - t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.attr.value) + t = self.__get_var_type_assign_t(original_node.body[0].targets[0].target.attr.value, + original_node.body[0].targets[0].target) if t is not None: t_annot_node_resolved = self.resolve_type_alias(t) @@ -1044,6 +1061,8 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, equal=cst.AssignEqual(whitespace_after=original_node.body[0].targets[0].whitespace_after_equal, whitespace_before=original_node.body[0].targets[0].whitespace_before_equal))] ) + else: + self.no_failed_applied_types += 1 # Typed variables elif match.matches(original_node, match.SimpleStatementLine(body=[match.AnnAssign(target=match.DoNotCare(), value=match.MatchIfTrue(lambda v: v is not None))])): @@ -1063,6 +1082,8 @@ def leave_SimpleStatementLine(self, original_node: cst.SimpleStatementLine, value=original_node.body[0].value, annotation=t_annot_node, equal=original_node.body[0].equal)]) + else: + self.no_failed_applied_types += 1 else: return updated_node.with_changes(body=[cst.Assign(targets=[cst.AssignTarget(target=original_node.body[0].target, whitespace_before_equal=original_node.body[0].equal.whitespace_before, From d41fea3615b8b24ba22429e05254d5ccfb48733e Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 26 Jul 2021 10:40:14 +0200 Subject: [PATCH 20/31] Improvements to the pipeline of TypeApplier: (1) Dry run (2) Assertion for no.
of applied types --- libsa4py/__main__.py | 6 +++++- libsa4py/cst_pipeline.py | 22 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 16a0b31..6e4e1de 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -12,7 +12,7 @@ def process_projects(args): def apply_types_projects(args): - tap = TypeAnnotatingProjects(args.p, args.o) + tap = TypeAnnotatingProjects(args.p, args.o, args.dry_run) tap.run(args.j) @@ -56,6 +56,10 @@ def main(): apply_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") apply_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") apply_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing projects") + apply_parser.add_argument("--d", dest='dry_run', action='store_true', + help="Dry run does not apply types to the dataset's files") + + apply_parser.set_defaults(dry_run=False) apply_parser.set_defaults(func=apply_types_projects) remove_parser = sub_parsers.add_parser('remove') diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 1c24da5..cc7963e 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -1,3 +1,4 @@ +from libsa4py.cst_visitor import TypeAnnotationCounter import os import traceback import random @@ -260,17 +261,21 @@ class TypeAnnotatingProjects: It applies the inferred type annotations to the input dataset """ - def __init__(self, projects_path: str, output_path: str, apply_nlp: bool = True): + def __init__(self, projects_path: str, output_path: str, dry_run: bool = False, + apply_nlp: bool = True): self.projects_path = projects_path self.output_path = output_path + self.dry_run = dry_run self.apply_nlp = apply_nlp def process_project(self, proj_json_path: str): proj_json = load_json(proj_json_path) total_added_types = 0 + total_no_types = 0 for p in proj_json.keys(): for i, (f, f_d) in enumerate(proj_json[p]['src_files'].items()): print(f"Adding types to file {f} from project {proj_json_path}") + total_no_types += f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] if f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] > 0: f_read = read_file(join(self.projects_path, f)) try: @@ -278,19 +283,23 @@ def process_project(self, proj_json_path: str): try: ta = TypeApplier(f_d, self.apply_nlp) f_parsed = cst.metadata.MetadataWrapper(f_parsed).visit(ta) - write_file(join(self.projects_path, f), f_parsed.code) + if not self.dry_run: + write_file(join(self.projects_path, f), f_parsed.code) total_added_types += ta.no_applied_types print(f"Applied {ta.no_applied_types} types to file {f} from project {proj_json_path}") + assert f_d['no_types_annot']['I'] + f_d['no_types_annot']['D'] <= self.__get_no_applied_types(f_parsed.code) + ta.no_failed_applied_types except KeyError as ke: print(f"A variable not found | project {proj_json_path} | file {f}", ke) traceback.print_exc() except TypeError as te: print(f"Project {proj_json_path} | file {f}", te) traceback.print_exc() + except AssertionError as te: + print(f"[AssertionError] Project {proj_json_path} | file {f}", te) except cst._exceptions.ParserSyntaxError as pse: print(f"Can't parsed file {f} in project {proj_json_path}", pse) - return total_added_types + return total_added_types, total_no_types def run(self, jobs: int): proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') @@ -299,8 +308,13 @@ def run(self, jobs: int): proj_type_added = 
ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) \ for p_j in proj_jsons) print(f"Finished applying types in {str(timedelta(seconds=time.time() - start_t))}") - print(f"{sum(proj_type_added)} types applied to the whole dataset") + print(f"{sum([a for a, t in proj_type_added]):,}/{sum([t for a, t in proj_type_added]):,} types applied to the whole dataset") + def __get_no_applied_types(self, code: str) -> int: + f_applied_p = cst.parse_module(code) + tac = TypeAnnotationCounter() + f_applied_p.visit(tac) + return tac.total_no_type_annot class TypeAnnotationsRemoval: """ From 3111e8445540843d0b3ebebf691f152c2f565a7e Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 26 Jul 2021 11:12:24 +0200 Subject: [PATCH 21/31] Fix test failure for the TypeApplier --- tests/examples/type_apply_ex.json | 20 + tests/examples/type_apply_typed_ex.json | 505 ++++++++++++++---------- tests/test_type_apply.py | 8 +- 3 files changed, 327 insertions(+), 206 deletions(-) diff --git a/tests/examples/type_apply_ex.json b/tests/examples/type_apply_ex.json index 5976215..817ffe7 100644 --- a/tests/examples/type_apply_ex.json +++ b/tests/examples/type_apply_ex.json @@ -105,6 +105,16 @@ { "name": "Delta", "q_name": "Foo.Delta", + "cls_lc": [ + [ + 10, + 4 + ], + [ + 11, + 31 + ] + ], "variables": { "foo_d": "" }, @@ -128,6 +138,16 @@ { "name": "Foo", "q_name": "Foo", + "cls_lc": [ + [ + 8, + 0 + ], + [ + 29, + 16 + ] + ], "variables": { "foo_v": "str", "foo_p": "pathlib.Path" diff --git a/tests/examples/type_apply_typed_ex.json b/tests/examples/type_apply_typed_ex.json index 388d001..4ce162c 100644 --- a/tests/examples/type_apply_typed_ex.json +++ b/tests/examples/type_apply_typed_ex.json @@ -1,224 +1,325 @@ { - "tests/examples": { - "src_files": { - "type_apply_typed.py": { - "untyped_seq": "a = [number] [EOL] l = [ [number] , [number] , [number] ] [EOL] c = [number] [EOL] [EOL] def foo ( x , y ) : [EOL] z = x + y [EOL] return z [EOL] [EOL] class Bar : [EOL] bar_var1 = [string] [EOL] bar_var2 = [number] [EOL] def __init__ ( a , b ) : [EOL] self . a = a [EOL] self .
b = b [EOL] def delta ( n ) : [EOL] return [ [number] ] * p [EOL]", - "typed_seq": "$builtins.int$ 0 0 0 $List[int]$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 $builtins.str$ 0 0 0 $builtins.float$ 0 0 0 0 0 0 $builtins.int$ 0 0 0 0 0 0 0 $builtins.int$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 $List[float]$ 0 $builtins.int$ 0 0 0 0 0 0 0 0 0 0", - "imports": [], - "variables": { - "a": "", - "l": "", - "c": "", - "h": "" - }, - "mod_var_occur": { - "a": [ - [ - "self", - "a", - "builtins", - "int", - "a" - ] - ], - "l": [], - "c": [] - }, - "classes": [ - { - "name": "Bar", - "q_name": "Bar", - "variables": { - "bar_var1": "", - "bar_var2": "" - }, - "cls_var_occur": { - "bar_var1": [], - "bar_var2": [] - }, - "funcs": [ - { - "name": "__init__", - "q_name": "Bar.__init__", - "fn_lc": [ + "tests/examples": { + "src_files": { + "type_apply_typed.py": { + "untyped_seq": "", + "typed_seq": "", + "imports": [], + "variables": { + "a": "", + "l": "", + "c": "", + "h": "builtins.dict" + }, + "mod_var_occur": { + "a": [ + [ + "self", + "a", + "a" + ] + ], + "l": [], + "c": [], + "h": [] + }, + "mod_var_ln": { + "a": [ [ - 11, - 4 + 1, + 0 ], [ - 13, - 18 + 1, + 1 ] ], - "params": { - "a": "", - "b": "" - }, - "ret_exprs": [], - "params_occur": { - "a": [ - [ - "self", - "a", - "builtins", - "int", - "a" - ] + "l": [ + [ + 2, + 0 ], - "b": [ - [ - "self", - "b", - "b" - ] + [ + 2, + 1 ] - }, - "ret_type": "", - "variables": { - "a": "", - "b": "" - }, - "fn_var_occur": { - "a": [ + ], + "c": [ + [ + 3, + 0 + ], + [ + 3, + 1 + ] + ], + "h": [ + [ + 4, + 0 + ], + [ + 4, + 1 + ] + ] + }, + "classes": [ + { + "name": "Bar", + "q_name": "Bar", + "cls_lc": [ + [ + 8, + 0 + ], [ - "self", - "a", - "builtins", - "int", - "a" + 15, + 25 ] ], - "b": [ - [ - "self", - "b", - "b" + "variables": { + "bar_var1": "", + "bar_var2": "" + }, + "cls_var_occur": { + "bar_var1": [], + "bar_var2": [] + }, + "cls_var_ln": { + "bar_var1": [ + [ + 9, + 4 + ], + [ + 9, + 12 + ] + ], + "bar_var2": [ + [ + 10, + 4 + ], + [ + 10, + 12 + ] ] + }, + "funcs": [ + { + "name": "__init__", + "q_name": "Bar.__init__", + "fn_lc": [ + [ + 11, + 4 + ], + [ + 13, + 18 + ] + ], + "params": { + "a": "", + "b": "" + }, + "ret_exprs": [], + "params_occur": { + "a": [ + [ + "self", + "a", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "ret_type": "", + "variables": { + "a": "", + "b": "" + }, + "fn_var_occur": { + "a": [ + [ + "self", + "a", + "a" + ] + ], + "b": [ + [ + "self", + "b", + "b" + ] + ] + }, + "fn_var_ln": { + "a": [ + [ + 12, + 8 + ], + [ + 12, + 14 + ] + ], + "b": [ + [ + 13, + 8 + ], + [ + 13, + 14 + ] + ] + }, + "params_descr": { + "a": "", + "b": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + }, + { + "name": "delta", + "q_name": "Bar.delta", + "fn_lc": [ + [ + 14, + 4 + ], + [ + 15, + 25 + ] + ], + "params": { + "n": "" + }, + "ret_exprs": [ + "return [2.17] * p" + ], + "params_occur": { + "n": [] + }, + "ret_type": "", + "variables": {}, + "fn_var_occur": {}, + "fn_var_ln": {}, + "params_descr": { + "n": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } + } ] - }, - "params_descr": { - "a": "", - "b": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null } - }, - { - "name": "delta", - "q_name": "Bar.delta", - "fn_lc": [ - [ - 14, - 4 + ], + "funcs": [ + { + "name": "foo", + "q_name": "foo", + "fn_lc": [ + [ + 5, + 0 + ], + [ + 7, + 12 + ] ], - [ - 15, - 25 - ] - ], - "params": { 
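As an aside on the new fixture fields: each *_ln entry records a source span as [[start_line, start_col], [end_line, end_col]]. Judging by these examples, lines are 1-based and columns 0-based, so a module variable `a` assigned on the file's first line comes out as:

    mod_var_ln = {"a": [[1, 0], [1, 1]]}  # the name `a` occupies line 1, columns 0..1
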
- "n": "" - }, - "ret_exprs": [ - "return [2.17] * p" - ], - "params_occur": { - "n": [] - }, - "ret_type": "", - "variables": {}, - "fn_var_occur": {}, - "params_descr": { - "n": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + "params": { + "x": "", + "y": "" + }, + "ret_exprs": [ + "return z" + ], + "params_occur": { + "x": [ + [ + "z", + "x", + "y" + ] + ], + "y": [ + [ + "z", + "x", + "y" + ] + ] + }, + "ret_type": "", + "variables": { + "z": "" + }, + "fn_var_occur": { + "z": [ + [ + "z", + "x", + "y" + ] + ] + }, + "fn_var_ln": { + "z": [ + [ + 6, + 4 + ], + [ + 6, + 5 + ] + ] + }, + "params_descr": { + "x": "", + "y": "" + }, + "docstring": { + "func": null, + "ret": null, + "long_descr": null + } } - } - ] - } - ], - "funcs": [ - { - "name": "foo", - "q_name": "foo", - "fn_lc": [ - [ - 5, - 0 ], - [ - 7, - 12 - ] - ], - "params": { - "x": "", - "y": "" - }, - "ret_exprs": [ - "return z" - ], - "params_occur": { - "x": [ - [ - "z", - "builtins", - "int", - "x", - "y" - ] + "set": null, + "tc": [ + false, + null ], - "y": [ - [ - "z", - "builtins", - "int", - "x", - "y" - ] - ] - }, - "ret_type": "", - "variables": { - "z": "" - }, - "fn_var_occur": { - "z": [ - [ - "z", - "builtins", - "int", - "x", - "y" - ] - ] - }, - "params_descr": { - "x": "", - "y": "" - }, - "docstring": { - "func": null, - "ret": null, - "long_descr": null + "no_types_annot": { + "U": 14, + "D": 1, + "I": 0 + }, + "type_annot_cove": 0.07 } } - ], - "set": null, - "tc": false, - "no_types_annot": { - "U": 0, - "D": 1, - "I": 0 - }, - "type_annot_cove": 0.0 -} } - } } \ No newline at end of file diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 704066e..6b23b63 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -12,7 +12,7 @@ dff = pd.DataFrame([1,2]) lit = "Hello!" class Foo: - foo_v: str = 'Hello, Foo!' + foo_v = 'Hello, Foo!' class Delta: foo_d = 'Hello, Delta!' foo_p = Path('/home/foo/bar') @@ -54,7 +54,7 @@ def Bar(x=['apple', 'orange'], *, c): dff: typing.List[pandas.arrays.PandasArray] = pd.DataFrame([1,2]) lit: typing.Literal = "Hello!" class Foo: - foo_v: str = 'Hello, Foo!' + foo_v = 'Hello, Foo!' class Delta: foo_d = 'Hello, Delta!' 
foo_p: pathlib.Path = Path('/home/foo/bar') @@ -74,7 +74,7 @@ def get_e(self): def get_e(self, y: builtins.str): Foo.foo_v = y return Foo.foo_v - foo_v = "No" + foo_v: str = "No" def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[builtins.str]: v: typing.List[builtins.str] = x l = lambda e: e+1 @@ -147,7 +147,7 @@ def test_type_apply_pipeline(self): out = """{}""".format("\n".join(out_split[7:])) self.assertEqual(exp, out) - self.assertEqual(total_no_added_types, 16) + self.assertEqual(total_no_added_types[0], 16) # The imported types from typing self.assertEqual(Counter(" ".join(exp_split[0:7])), Counter(" ".join(out_split[0:7]))) From 8efb7b1d41f41054884f1a478e29a4562c2a322e Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 28 Jul 2021 11:05:58 +0200 Subject: [PATCH 22/31] Improvements to the TypeRemoval pipeline : (1) Dry run (2) better multi-processing (3) Max try --- libsa4py/__main__.py | 6 +- libsa4py/cst_pipeline.py | 139 +++++++++++++++++++++++++++------------ 2 files changed, 102 insertions(+), 43 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 6e4e1de..8dc283e 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -17,7 +17,7 @@ def apply_types_projects(args): def remove_err_type_annotations(args): - tar = TypeAnnotationsRemoval(args.p, args.o, "") + tar = TypeAnnotationsRemoval(args.p, args.o, "", args.l, args.dry_run) tar.run(args.j) @@ -66,6 +66,10 @@ def main(): remove_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") remove_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") remove_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing files") + remove_parser.add_argument("--l", required=False, type=int, help="Number of projects to process") + remove_parser.add_argument("--d", dest='dry_run', action='store_true', + help="Dry run does not remove types from the dataset's files") + remove_parser.set_defaults(dry_run=False) remove_parser.set_defaults(func=remove_err_type_annotations) args = arg_parser.parse_args() diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index cc7963e..ffa870d 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -4,6 +4,7 @@ import random import csv import time +import queue from typing import List, Dict, Tuple from os.path import join @@ -11,7 +12,8 @@ from pathlib import Path from datetime import timedelta from joblib import delayed -from multiprocessing import Manager +from multiprocessing import Manager, Process, Queue, managers +from multiprocessing.queues import Queue from dpu_utils.utils.dataloading import load_jsonl_gz from libsa4py.cst_extractor import Extractor from libsa4py.cst_transformers import TypeApplier @@ -321,13 +323,17 @@ class TypeAnnotationsRemoval: Removes type annotations that cannot be type-checked by mypy """ - def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, apply_nlp: bool = True): + def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, no_projects_limit: int = None, + dry_run: bool = False, apply_nlp: bool = True): self.projects_path = projects_path self.processed_projects_path = processed_projects_path self.output_path = output_path + self.no_projects_limit = no_projects_limit + self.dry_run = dry_run self.apply_nlp = apply_nlp - def process_file(self, f: str, f_d_repr: dict, tc_res: dict): + #def process_file(self, f: str, 
f_d_repr: dict, tc_res: dict): + def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. # init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), # MypyManager('mypy', MAX_TC_TIME)) @@ -336,49 +342,97 @@ def process_file(self, f: str, f_d_repr: dict, tc_res: dict): # return # else: # Only files with type annotations - if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: + while not is_f_loader_done.value or q.qsize() != 0: try: - tmp_f = create_tmp_file(".py") - f_read = read_file(join(self.projects_path, f)) - f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], - tmp_f) - print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ - total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']}") - tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, - "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} - # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - write_file(join(self.projects_path, f), f_tc_code) - except Exception as e: - print(f"f: {f} | e: {e}") - traceback.print_exc() - finally: - delete_tmp_file(tmp_f) + f, f_d_repr = q.get(True, 1) + if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: + try: + tmp_f = create_tmp_file(".py") + f_read = read_file(join(self.projects_path, f)) + f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], + tmp_f) + print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ + total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") + tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} + # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) + if not self.dry_run and tc_errs == 0: + write_file(join(self.projects_path, f), f_tc_code) + except Exception as e: + print(f"f: {f} | e: {e}") + traceback.print_exc() + finally: + delete_tmp_file(tmp_f) + except queue.Empty as e: + print(f"Worker {os.getpid()} finished! 
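The worker loop above is a standard producer/consumer layout built on a Manager-backed queue. Stripped of the type-checking details it looks roughly like this; the names are illustrative, not libsa4py API:

    import queue
    from multiprocessing import Manager, Process

    def worker(q, producer_done):
        # Drain the queue until the producer is done AND the queue is empty;
        # the 1-second timeout lets the worker re-check the exit condition.
        while not producer_done.value or q.qsize() != 0:
            try:
                item = q.get(True, 1)
            except queue.Empty:
                continue
            print(f"processing {item}")

    if __name__ == "__main__":
        m = Manager()
        q, done = m.Queue(), m.Value('i', False)
        workers = [Process(target=worker, args=(q, done)) for _ in range(4)]
        for w in workers:
            w.start()
        for i in range(10):
            q.put(i)
        done.value = True
        for w in workers:
            w.join()
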
Queue's empty!") + print(f"File loader working {is_f_loader_done.value} and queue size {q.qsize()}") def run(self, jobs: int): - merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) - not_tced_src_f: List[Tuple[str, dict]] = [] - for p, p_v in list(merged_projects['projects'].items()): - for f, f_v in p_v['src_files'].items(): - if not f_v['tc'][0] and f_v['tc'] != [False, None]: - not_tced_src_f.append((f, f_v)) - - del merged_projects - # not_tced_src_f = not_tced_src_f[:250] - # print("L:", len(not_tced_src_f)) manager = Manager() + q = manager.Queue() + is_f_loader_done = manager.Value('i', False) + + file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done)) + file_loader.start() + #file_loader.join() + + print("File loader started!") + + # merged_projects = load_json(join(self.processed_projects_path, "merged_all_projects.json")) + # not_tced_src_f: List[Tuple[str, dict]] = [] + # for p, p_v in list(merged_projects['projects'].items()): + # for f, f_v in p_v['src_files'].items(): + # if not f_v['tc'][0] and f_v['tc'] != [False, None]: + # not_tced_src_f.append((f, f_v)) + + # del merged_projects + # # not_tced_src_f = not_tced_src_f[:250] + # # print("L:", len(not_tced_src_f)) + # manager = Manager() + time.sleep(5) + start_t = time.time() tc_res = manager.dict() - ParallelExecutor(n_jobs=jobs)(total=len(not_tced_src_f))(delayed(self.process_file)(f, f_d, tc_res) \ - for f, f_d in not_tced_src_f) - + file_processors = [] + for j in range(jobs): + p = Process(target=self.process_file, args=(q, is_f_loader_done, tc_res)) + p.daemon = True + file_processors.append(p) + p.start() + + + for p in file_processors: + p.join() + file_loader.join() + # ParallelExecutor(n_jobs=jobs)(total=0)(delayed(self.process_file)(f, f_d, tc_res) \ + # for f, f_d in not_tced_src_f) + print(f"Finished fixing invalid types in {str(timedelta(seconds=time.time() - start_t))}") save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) - + + def __load_projects_files(self, q: Queue, is_done): + proj_jsons = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') + proj_jsons = proj_jsons[:self.no_projects_limit] if self.no_projects_limit is not None else proj_jsons + f_loaded = 0 + for p_j in proj_jsons: + proj_json = load_json(p_j) + for _, p_v in proj_json.items(): + for f, f_v in p_v['src_files'].items(): + if not f_v['tc'][0] and f_v['tc'] != [False, None] and f_v['tc'][1] <= 100: + q.put((f, f_v)) + f_loaded += 1 + #print("Adding files to Queue...") + is_done.value = True + print(f"Loaded {f_loaded} Python files") + def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: type_annots_removed: List[str] = [] + no_try = 0 + MAX_TRY = 10 def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + nonlocal no_try if no_tc_err is not None: if tc: type_annots_removed.append(org_gt) @@ -386,8 +440,9 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) - elif no_tc_err == curr_no_tc_err: + else: org_gt_d = org_gt + no_try += 1 return tc, no_tc_err, f_code @@ -408,7 +463,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['variables'][m_v] = m_v_t tc, 
no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, m_v_t, f_d_repr['variables'][m_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed for i, fn in enumerate(f_d_repr['funcs']): @@ -428,7 +483,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['funcs'][i]['params'][p_n] = p_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['funcs'][i]['params'][p_n]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed for fn_v, fn_v_t in fn['variables'].items(): @@ -447,7 +502,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, f_d_repr['funcs'][i]['variables'][fn_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The return type for module-level functions @@ -467,7 +522,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['funcs'][i]['ret_type'] = org_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['funcs'][i]['ret_type']) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The type of class-level vars @@ -488,7 +543,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, c_v_t, f_d_repr['classes'][c_i]['variables'][c_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The type of arguments for class-level functions @@ -509,7 +564,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The type of local variables for class-level functions @@ -529,7 +584,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed # The return type for class-level functions @@ -550,7 +605,7 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) - if tc: + if tc or no_try > MAX_TRY: return f_code, no_tc_err, type_annots_removed return out_f_code, init_no_tc_err, type_annots_removed From 1fc8156571cbfa5a4580a0485231e441dd14972c Mon Sep 17 00:00:00 2001 From: mir-am Date: Wed, 28 Jul 2021 11:08:58 +0200 Subject: [PATCH 23/31] Run mypy with the file's abs. 
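Abstracting away the per-slot repetition in PATCH 22, the removal strategy is a greedy loop with a bounded retry budget: blank one annotation at a time, re-run mypy, keep the removal if the error count drops, restore it otherwise, and give up after MAX_TRY failed attempts. A condensed sketch, where render() and check() are hypothetical stand-ins for applying the annotations and counting mypy errors:

    MAX_TRY = 10

    def remove_unchecked(annots: dict, render, check) -> dict:
        # annots maps a slot (e.g. a parameter) to its annotation string;
        # render() produces source with the current annotations applied and
        # check() returns mypy's error count for that source.
        curr_err = check(render(annots))
        no_try = 0
        for slot, ann in list(annots.items()):
            if no_try > MAX_TRY:
                break
            if ann == "":
                continue
            annots[slot] = ""                # tentatively drop this annotation
            new_err = check(render(annots))
            if new_err == 0:
                break                        # the file now type-checks
            if new_err < curr_err:
                curr_err = new_err           # the removal helped; keep it
            else:
                annots[slot] = ann           # no improvement; restore it
                no_try += 1
        return annots
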
path, which may improve TC in some cases --- libsa4py/type_check.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index abcfe28..40a9f19 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -68,10 +68,12 @@ def _build_tc_cmd(self, fpath): def _type_check(self, fpath): try: - cwd = os.getcwd() - os.chdir(dirname(fpath)) + # cwd = os.getcwd() + # os.chdir(dirname(fpath)) + # Runs mypy with the file's absolute path + # It may improve detection of type errors in some cases! result = subprocess.run( - self._build_tc_cmd(basename(fpath)), + self._build_tc_cmd(fpath), # basename(fpath) capture_output=True, text=True, timeout=self._timeout, @@ -81,8 +83,8 @@ def _type_check(self, fpath): return retcode, outlines except subprocess.TimeoutExpired: raise TypeCheckingTooLong - finally: - os.chdir(cwd) + # finally: + # os.chdir(cwd) @abstractmethod def _check_tc_outcome(self, returncode, outlines): From a89338c04084da339ae67d18703347b28d8cc3f7 Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 2 Aug 2021 14:29:34 +0200 Subject: [PATCH 24/31] Fixing re-importing names when applying types --- libsa4py/cst_transformers.py | 17 ++++- tests/examples/type_apply_ex.json | 105 +++++++++++++++--------------- tests/test_type_apply.py | 5 +- 3 files changed, 70 insertions(+), 57 deletions(-) diff --git a/libsa4py/cst_transformers.py b/libsa4py/cst_transformers.py index 20261db..af078a7 100644 --- a/libsa4py/cst_transformers.py +++ b/libsa4py/cst_transformers.py @@ -876,6 +876,8 @@ def __init__(self, f_processeed_dict: dict, apply_nlp: bool=True): self.no_applied_types = 0 self.no_failed_applied_types = 0 + self.imported_names: List[str] = [] + if apply_nlp: self.nlp_p = NLPreprocessor().process_identifier else: @@ -1116,20 +1118,29 @@ def visit_AssignTarget(self, node: cst.AssignTarget): def leave_Module(self, original_node: cst.Module, updated_node: cst.Module): return updated_node.with_changes(body=self.__get_required_imports() + list(updated_node.body)) + def visit_ImportAlias(self, node: cst.ImportAlias): + self.imported_names.extend([n.value for n in match.findall(node.name, match.Name(value=match.DoNotCare()))]) + # TODO: Check the imported modules before adding new ones def __get_required_imports(self): - def find_required_modules(all_types): + def find_required_modules(all_types, imported_names): req_mod = set() for _, a_node in all_types: m = match.findall(a_node.annotation, match.Attribute(value=match.DoNotCare(), attr=match.DoNotCare())) if len(m) != 0: for i in m: - req_mod.add([n.value for n in match.findall(i, match.Name(value=match.DoNotCare()))][0]) + mod_imp = [n.value for n in match.findall(i, match.Name(value=match.DoNotCare()))][0] + if mod_imp not in imported_names: + req_mod.add(mod_imp) + # if n.value not in imported_names + print(req_mod) return req_mod req_imports = [] - all_req_mods = find_required_modules(self.all_applied_types) + self.imported_names = set(self.imported_names) + all_req_mods = find_required_modules(self.all_applied_types, self.imported_names) all_type_names = set(chain.from_iterable(map(lambda t: regex.findall(r"\w+", t[0]), self.all_applied_types))) + all_type_names = all_type_names - self.imported_names typing_imports = PY_TYPING_MOD & all_type_names collection_imports = PY_COLLECTION_MOD & all_type_names diff --git a/tests/examples/type_apply_ex.json index 817ffe7..608d44e 100644 --- a/tests/examples/type_apply_ex.json +++ 
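Pulled out of the class for illustration, the _type_check change in PATCH 23 boils down to the call below, using the flags _build_tc_cmd assembles at this point in the series; the 120-second timeout is only a stand-in for MAX_TC_TIME, whose value is configured elsewhere:

    import subprocess

    def run_mypy(fpath: str, timeout: int = 120):
        # Running mypy on the absolute path replaces the old pattern of
        # chdir-ing into the file's directory and passing the basename.
        result = subprocess.run(
            ["mypy", "--show-error-codes", "--no-incremental",
             "--cache-dir=/dev/null", fpath],
            capture_output=True, text=True, timeout=timeout,
        )
        return result.returncode, result.stdout.splitlines()
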
b/tests/examples/type_apply_ex.json @@ -6,7 +6,8 @@ "typed_seq": "", "imports": [ "pathlib", - "Path" + "Path", + "pandas" ], "variables": { "x": "builtins.int", @@ -42,61 +43,61 @@ "mod_var_ln": { "x": [ [ - 2, + 3, 0 ], [ - 2, + 3, 1 ] ], "l": [ [ - 3, + 4, 0 ], [ - 3, + 4, 1 ] ], "c": [ [ - 4, + 5, 0 ], [ - 4, + 5, 1 ] ], "df": [ [ - 5, + 6, 0 ], [ - 5, + 6, 2 ] ], "dff": [ [ - 6, + 7, 0 ], [ - 6, + 7, 3 ] ], "lit": [ [ - 7, + 8, 0 ], [ - 7, + 8, 3 ] ] @@ -107,11 +108,11 @@ "q_name": "Foo.Delta", "cls_lc": [ [ - 10, + 11, 4 ], [ - 11, + 12, 31 ] ], @@ -124,11 +125,11 @@ "cls_var_ln": { "foo_d": [ [ - 11, + 12, 8 ], [ - 11, + 12, 13 ] ] @@ -140,16 +141,16 @@ "q_name": "Foo", "cls_lc": [ [ - 8, + 9, 0 ], [ - 29, - 16 + 30, + 30 ] ], "variables": { - "foo_v": "str", + "foo_v": "builtins.str", "foo_p": "pathlib.Path" }, "cls_var_occur": { @@ -173,21 +174,21 @@ "cls_var_ln": { "foo_v": [ [ - 29, + 30, 4 ], [ - 29, + 30, 9 ] ], "foo_p": [ [ - 12, + 13, 4 ], [ - 12, + 13, 9 ] ] @@ -198,11 +199,11 @@ "q_name": "Foo.__init__..foo_inner", "fn_lc": [ [ - 15, + 16, 8 ], [ - 16, + 17, 16 ] ], @@ -234,11 +235,11 @@ "q_name": "Foo.__init__", "fn_lc": [ [ - 13, + 14, 4 ], [ - 16, + 17, 16 ] ], @@ -259,11 +260,11 @@ "fn_var_ln": { "i": [ [ - 14, + 15, 8 ], [ - 14, + 15, 14 ] ] @@ -282,11 +283,11 @@ "q_name": "Foo.foo_fn..foo_inner", "fn_lc": [ [ - 18, + 19, 8 ], [ - 19, + 20, 16 ] ], @@ -330,11 +331,11 @@ "q_name": "Foo.foo_fn", "fn_lc": [ [ - 17, + 18, 4 ], [ - 21, + 22, 16 ] ], @@ -370,11 +371,11 @@ "fn_var_ln": { "d": [ [ - 20, + 21, 8 ], [ - 20, + 21, 9 ] ] @@ -394,11 +395,11 @@ "q_name": "Foo.get_e", "fn_lc": [ [ - 23, + 24, 4 ], [ - 24, + 25, 24 ] ], @@ -429,11 +430,11 @@ "q_name": "Foo.get_e", "fn_lc": [ [ - 26, + 27, 4 ], [ - 28, + 29, 24 ] ], @@ -474,11 +475,11 @@ "fn_var_ln": { "foo_v": [ [ - 27, + 28, 8 ], [ - 27, + 28, 17 ] ] @@ -502,11 +503,11 @@ "q_name": "Bar", "fn_lc": [ [ - 30, + 31, 0 ], [ - 33, + 34, 12 ] ], @@ -557,21 +558,21 @@ "fn_var_ln": { "v": [ [ - 31, + 32, 4 ], [ - 31, + 32, 5 ] ], "l": [ [ - 32, + 33, 4 ], [ - 32, + 33, 5 ] ] @@ -593,11 +594,11 @@ null ], "no_types_annot": { - "U": 15, - "D": 14, + "U": 14, + "D": 15, "I": 0 }, - "type_annot_cove": 0.48 + "type_annot_cove": 0.52 } } } diff --git a/tests/test_type_apply.py b/tests/test_type_apply.py index 6b23b63..8607de1 100644 --- a/tests/test_type_apply.py +++ b/tests/test_type_apply.py @@ -5,6 +5,7 @@ import shutil test_file = """from pathlib import Path +import pandas x: int = 12 l = [(1, 2)] c = defaultdict(int) @@ -41,12 +42,12 @@ def Bar(x=['apple', 'orange'], *, c): test_file_exp = """from typing import Tuple, Dict, List, Literal from collections import defaultdict -import pandas import pathlib import builtins import collections import typing from pathlib import Path +import pandas x: builtins.int = 12 l: typing.List[typing.Tuple[builtins.int, builtins.int]] = [(1, 2)] c: collections.defaultdict = defaultdict(int) @@ -74,7 +75,7 @@ def get_e(self): def get_e(self, y: builtins.str): Foo.foo_v = y return Foo.foo_v - foo_v: str = "No" + foo_v: builtins.str = "No" def Bar(x: typing.List[builtins.str]=['apple', 'orange'], *, c)-> typing.List[builtins.str]: v: typing.List[builtins.str] = x l = lambda e: e+1 From d37484863b9898a8ef03be8bca4d500001ae5c9f Mon Sep 17 00:00:00 2001 From: mir-am Date: Mon, 2 Aug 2021 14:35:26 +0200 Subject: [PATCH 25/31] (1) Exclude source files in the ignored list for the main pipeline, (2) include type error categories by mypy in the JSON output --- libsa4py/__main__.py | 4 ++- 
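To see what the visit_ImportAlias added in PATCH 24 collects, here is a small self-contained run of the same matcher pattern. Note that for a from-import only the bound names appear as aliases, not the source module, which is why the applier still subtracts collected names from both the module set and the type-name set:

    import libcst as cst
    import libcst.matchers as match

    code = "import pandas\nimport os.path\nfrom pathlib import Path\n"
    imported = []
    for alias in match.findall(cst.parse_module(code), match.ImportAlias()):
        imported.extend(n.value for n in match.findall(alias.name, match.Name()))
    print(imported)  # ['pandas', 'os', 'path', 'Path']
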
libsa4py/cst_pipeline.py | 64 ++++++++++++++++++++++++---------------- libsa4py/type_check.py | 8 ++--- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 8dc283e..4eff86c 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -7,7 +7,8 @@ def process_projects(args): input_repos = find_repos_list(args.p) if args.l is None else find_repos_list(args.p)[:args.l] - p = Pipeline(args.p, args.o, not args.no_nlp, args.use_cache, args.use_pyre, args.use_tc, args.d, args.s) + p = Pipeline(args.p, args.o, not args.no_nlp, args.use_cache, args.use_pyre, args.use_tc, args.d, + args.s, args.i) p.run(input_repos, args.j) @@ -31,6 +32,7 @@ def main(): process_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") process_parser.add_argument("--d", "--deduplicate", required=False, type=str, help="Path to duplicate files") process_parser.add_argument("--s", "--split", required=False, type=str, help="Path to the dataset split files") + process_parser.add_argument("--i", "--ignore", required=False, type=str, help="Path to the ignored files") process_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing projects") process_parser.add_argument("--l", required=False, type=int, help="Number of projects to process") process_parser.add_argument("--c", "--cache", dest='use_cache', action='store_true', help="Whether to ignore processed projects") diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index ffa870d..4eaf82e 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -16,7 +16,7 @@ from multiprocessing.queues import Queue from dpu_utils.utils.dataloading import load_jsonl_gz from libsa4py.cst_extractor import Extractor -from libsa4py.cst_transformers import TypeApplier +from libsa4py.cst_transformers import TypeAnnotationRemover, TypeApplier from libsa4py.exceptions import ParseError, NullProjectException from libsa4py.nl_preprocessing import NLPreprocessor from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file, \ @@ -38,7 +38,7 @@ class Pipeline: def __init__(self, projects_path, output_dir, nlp_transf: bool = True, use_cache: bool = True, use_pyre: bool = False, use_tc: bool = False, - dups_files_path=None, split_files_path=None): + dups_files_path=None, split_files_path=None, ignored_files_path=None): self.projects_path = projects_path self.output_dir = output_dir self.processed_projects = None @@ -60,6 +60,11 @@ def __init__(self, projects_path, output_dir, nlp_transf: bool = True, else: self.is_file_duplicate = lambda x: False + if ignored_files_path is not None: + self.ignored_files = set(read_file(ignored_files_path).splitlines()) + else: + self.ignored_files = {} + if self.use_tc: self.tc = MypyManager('mypy', MAX_TC_TIME) @@ -162,6 +167,8 @@ def process_project(self, i, project): print(f"{project_id} has {len(project_files)} files before deduplication") project_files = [f for f in project_files if not self.is_file_duplicate(f)] print(f"{project_id} has {len(project_files)} files after deduplication") + project_files = [f for f in project_files if str(Path(f).relative_to(Path(self.projects_path).parent)) not in self.ignored_files] + print(f"{project_id} has {len(project_files)} files after ignoring files") project_files = [(f, str(Path(f).relative_to(Path(self.projects_path).parent))) for f in project_files] project_files = [(f, f_r, 
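The ignore check compares paths relative to the parent of the projects directory, which matches the author/repo-prefixed form the processed JSONs use for file keys. For a hypothetical layout:

    from pathlib import Path

    projects_path = Path("/data/repos")                    # hypothetical root
    f = Path("/data/repos/author/repo/pkg/mod.py")
    rel = str(f.relative_to(projects_path.parent))
    print(rel)  # 'repos/author/repo/pkg/mod.py'
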
self.split_dataset_files[f_r] if f_r in self.split_dataset_files else None) for f, @@ -323,6 +330,8 @@ class TypeAnnotationsRemoval: Removes type annotations that cannot be type-checked by mypy """ + MAX_TYPE_ERRORS_PER_FILE = 500 + def __init__(self, projects_path: str, processed_projects_path: str, output_path: str, no_projects_limit: int = None, dry_run: bool = False, apply_nlp: bool = True): self.projects_path = projects_path @@ -349,12 +358,13 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): try: tmp_f = create_tmp_file(".py") f_read = read_file(join(self.projects_path, f)) - f_tc_code, tc_errs, type_annot_r = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], + f_tc_code, tc_errs, type_annot_r, tc_errors = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], tmp_f) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, - "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D']} + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D'], + "errors": tc_errors} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) if not self.dry_run and tc_errs == 0: write_file(join(self.projects_path, f), f_tc_code) @@ -416,7 +426,7 @@ def __load_projects_files(self, q: Queue, is_done): proj_json = load_json(p_j) for _, p_v in proj_json.items(): for f, f_v in p_v['src_files'].items(): - if not f_v['tc'][0] and f_v['tc'] != [False, None] and f_v['tc'][1] <= 100: + if not f_v['tc'][0] and f_v['tc'] != [False, None, None] and f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: q.put((f, f_v)) f_loaded += 1 #print("Adding files to Queue...") @@ -431,7 +441,7 @@ def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_ MAX_TRY = 10 def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): - tc, no_tc_err, f_code = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) nonlocal no_try if no_tc_err is not None: if tc: @@ -440,13 +450,15 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) + no_try += 1 else: org_gt_d = org_gt no_try += 1 - return tc, no_tc_err, f_code + return tc, no_tc_err, f_code, tc_errors out_f_code: str = "" + tc_errors = None for m_v, m_v_t in f_d_repr['variables'].items(): if m_v_t != "": print(f"Type-checking module-level variable {m_v} with annotation {m_v_t}") @@ -461,10 +473,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(m_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['variables'][m_v] = m_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, m_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, m_v_t, f_d_repr['variables'][m_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors for i, fn in enumerate(f_d_repr['funcs']): for p_n, p_t in fn['params'].items(): @@ -481,10 +493,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, 
org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['params'][p_n] = p_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['funcs'][i]['params'][p_n]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": @@ -500,10 +512,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, f_d_repr['funcs'][i]['variables'][fn_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The return type for module-level functions if f_d_repr['funcs'][i]['ret_type'] != "": @@ -520,10 +532,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['ret_type'] = org_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['funcs'][i]['ret_type']) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The type of class-level vars for c_i, c in enumerate(f_d_repr['classes']): @@ -541,10 +553,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(c_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, c_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, c_v_t, f_d_repr['classes'][c_i]['variables'][c_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The type of arguments for class-level functions for fn_i, fn in enumerate(c['funcs']): @@ -562,10 +574,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, p_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The type of local variables for class-level functions for fn_v, fn_v_t in fn['variables'].items(): @@ -582,10 +594,10 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, 
f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": @@ -603,16 +615,16 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - tc, no_tc_err, f_code = type_check_ta(init_no_tc_err, out_f_code, org_t, + tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, org_t, f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed + return f_code, no_tc_err, type_annots_removed, tc_errors - return out_f_code, init_no_tc_err, type_annots_removed + return out_f_code, init_no_tc_err, type_annots_removed, tc_errors def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: NamedTemporaryFile): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, apply_nlp=self.apply_nlp)) write_to_tmp_file(out_f, f_t_applied.code) - tc, no_tc_err = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) - return tc, no_tc_err, f_t_applied.code + tc, no_tc_err, tc_errors = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) + return tc, no_tc_err, f_t_applied.code, tc_errors diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index 40a9f19..770ac41 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -167,13 +167,13 @@ def _report_errors(self, parsed_result): print(f"Error breaking down: {parsed_result.err_breakdown}.") -def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None]]: +def type_check_single_file(f_path: str, tc: TCManager) -> Tuple[bool, Union[int, None], Union[dict, None]]: try: no_t_err = tc.heavy_assess(f_path) if no_t_err is not None: - return (True, 0) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs) + return (True, 0, no_t_err.err_breakdown) if no_t_err.no_type_errs == 0 else (False, no_t_err.no_type_errs, no_t_err.err_breakdown) else: - return False, None + return False, None, None except IndexError: print(f"f: {f_path} - No output from Mypy!") - return False, None + return False, None, None From 729c76d4db822fba1ce4b6196abca7fef6aac2fa Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 11:54:47 +0200 Subject: [PATCH 26/31] Putting large projects at the front of the jobs' queue to reduce overall processing time --- libsa4py/cst_pipeline.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 4cbe4c3..5eda048 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -137,7 +137,7 @@ def fn_nlp_transf(fn_d: dict, nlp_prep: NLPreprocessor): return extracted_module - def process_project(self, i, project): + def process_project(self, i, project, project_files: List[str]): project_id = f'{project["author"]}/{project["repo"]}' project_analyzed_files: dict = {project_id: {"src_files": {}, "type_annot_cove": 0.0}} @@ -148,7 +148,6 @@ def process_project(self, i, project): print(f'Extracting for {project_id}...') extracted_avl_types = None - project_files = list_files(join(self.projects_path, project["author"], project["repo"])) print(f"{project_id} has 
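With the error breakdown included in PATCH 25, callers of type_check_single_file now unpack a three-tuple. A usage sketch; the 120-second budget again stands in for MAX_TC_TIME:

    from libsa4py.type_check import MypyManager, type_check_single_file

    tc, n_errs, err_breakdown = type_check_single_file(
        "pkg/mod.py", MypyManager('mypy', 120))
    if tc:
        print("file type-checks cleanly")
    elif n_errs is not None:
        print(f"{n_errs} type errors, by category: {err_breakdown}")
    else:
        print("mypy produced no usable output")
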
{len(project_files)} files before deduplication") project_files = [f for f in project_files if not self.is_file_duplicate(f)] print(f"{project_id} has {len(project_files)} files after deduplication") @@ -233,12 +232,14 @@ def process_project(self, i, project): def run(self, repos_list: List[Dict], jobs, start=0): print(f"Number of projects to be processed: {len(repos_list)}") - repos_list = [p for p in repos_list if not (os.path.exists(self.get_project_filename(p)) and self.use_cache)] + repos_list = [(p, list_files(join(self.projects_path, p["author"], p["repo"]))) \ + for p in repos_list if not (os.path.exists(self.get_project_filename(p)) and self.use_cache)] + repos_list.sort(key=lambda x: len(x[1]), reverse=True) print(f"Number of projects to be processed after considering cache: {len(repos_list)}") start_t = time.time() ParallelExecutor(n_jobs=jobs)(total=len(repos_list))( - delayed(self.process_project)(i, project) for i, project in enumerate(repos_list, start=start)) + delayed(self.process_project)(i, p, p_files) for i, (p, p_files) in enumerate(repos_list, start=start)) print("Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time()-start_t)))) if self.use_pyre: From 35539442f4195d251c1a3cf95efec9fa255b2c0a Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 11:57:23 +0200 Subject: [PATCH 27/31] ignore type errors of imported modules and missing imports when type-checking by mypy --- libsa4py/type_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libsa4py/type_check.py b/libsa4py/type_check.py index 770ac41..4aeaebe 100644 --- a/libsa4py/type_check.py +++ b/libsa4py/type_check.py @@ -126,7 +126,8 @@ def heavy_assess(self, fpath): class MypyManager(TCManager): def _build_tc_cmd(self, fpath): # Mypy needs a flag to display the error codes - return ["mypy", "--show-error-codes", "--no-incremental", "--cache-dir=/dev/null", fpath] + return ["mypy", "--show-error-codes", "--no-incremental", "--cache-dir=/dev/null", + "--follow-imports=silent", "--ignore-missing-imports", fpath] def _check_tc_outcome(self, _, outlines): if any(l.endswith(err) for l in outlines for err in self._inc_errcodes): From 61b4b0cb78802d7ba7ea7dcc632221b5382e20ea Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 12:01:03 +0200 Subject: [PATCH 28/31] Improvements to TypeApplier: (1) write ignored files to a separate file (2) fix try attempts for failed type-checking (3) type-check the original file rather than a temp file --- libsa4py/cst_pipeline.py | 71 +++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 4eaf82e..9cc52ea 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -342,7 +342,7 @@ def __init__(self, projects_path: str, processed_projects_path: str, output_path self.apply_nlp = apply_nlp #def process_file(self, f: str, f_d_repr: dict, tc_res: dict): - def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): + def process_file(self, q: Queue, is_f_loader_done, tc_res: dict, ignored_files: list): # TODO: The initial type-checking should not be done after adding no. type errors to the representation later on. 
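Sorting the work items in decreasing order of size is the classic longest-processing-time-first heuristic: handing the biggest projects out first avoids a long tail where one worker grinds through a huge repository after all the others have finished. In miniature:

    repos = [("tiny", ["a.py"]), ("huge", ["a.py", "b.py", "c.py"])]
    repos.sort(key=lambda r: len(r[1]), reverse=True)  # biggest first
    print([name for name, _ in repos])  # ['huge', 'tiny']
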
# init_tc, init_no_tc_err = type_check_single_file(join(self.projects_path, f), # MypyManager('mypy', MAX_TC_TIME)) @@ -356,23 +356,33 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict): f, f_d_repr = q.get(True, 1) if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: try: - tmp_f = create_tmp_file(".py") + #tmp_f = create_tmp_file(".py") f_read = read_file(join(self.projects_path, f)) - f_tc_code, tc_errs, type_annot_r, tc_errors = self.__remove_unchecked_type_annot(f_read, f_d_repr, f_d_repr['tc'][1], - tmp_f) + _, tc_errs, type_annot_r, tc_errors = self.remove_unchecked_type_annot(join(self.projects_path, f), + f_read, f_d_repr, f_d_repr['tc'][1]) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": tc_errs, "ta_rem": type_annot_r, "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D'], "errors": tc_errors} # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) - if not self.dry_run and tc_errs == 0: - write_file(join(self.projects_path, f), f_tc_code) + if tc_errs == 0: + if self.dry_run: + write_file(join(self.projects_path, f), f_read) + else: + write_file(join(self.projects_path, f), f_read) + ignored_files.append(f) except Exception as e: - print(f"f: {f} | e: {e}") + print(f"F: {f} | e: {e}") traceback.print_exc() - finally: - delete_tmp_file(tmp_f) + # finally: + # delete_tmp_file(tmp_f) + else: + print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") + tc_res[f] = {"init_tc_errs": f_d_repr['tc'][1], "curr_tc_errs": f_d_repr['tc'][1], "ta_rem": None, + "total_ta": f_d_repr["no_types_annot"]['I'] + f_d_repr["no_types_annot"]['D'], + "errors": None} + ignored_files.append(f) except queue.Empty as e: print(f"Worker {os.getpid()} finished! 
Queue's empty!") print(f"File loader working {is_f_loader_done.value} and queue size {q.qsize()}") @@ -381,8 +391,9 @@ def run(self, jobs: int): manager = Manager() q = manager.Queue() is_f_loader_done = manager.Value('i', False) + ignored_files_a = manager.list() - file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done)) + file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done, ignored_files_a)) file_loader.start() #file_loader.join() @@ -402,14 +413,14 @@ def run(self, jobs: int): time.sleep(5) start_t = time.time() tc_res = manager.dict() + ignored_files_b = manager.list() file_processors = [] for j in range(jobs): - p = Process(target=self.process_file, args=(q, is_f_loader_done, tc_res)) + p = Process(target=self.process_file, args=(q, is_f_loader_done, tc_res, ignored_files_b)) p.daemon = True file_processors.append(p) p.start() - for p in file_processors: p.join() file_loader.join() @@ -417,8 +428,9 @@ def run(self, jobs: int): # for f, f_d in not_tced_src_f) print(f"Finished fixing invalid types in {str(timedelta(seconds=time.time() - start_t))}") save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) + write_file(join(self.processed_projects_path, 'ignored_files.txt'), '\n'.join(list(ignored_files_a) + list(ignored_files_b))) - def __load_projects_files(self, q: Queue, is_done): + def __load_projects_files(self, q: Queue, is_done, ignored_files: list): proj_jsons = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') proj_jsons = proj_jsons[:self.no_projects_limit] if self.no_projects_limit is not None else proj_jsons f_loaded = 0 @@ -426,22 +438,28 @@ def __load_projects_files(self, q: Queue, is_done): proj_json = load_json(p_j) for _, p_v in proj_json.items(): for f, f_v in p_v['src_files'].items(): - if not f_v['tc'][0] and f_v['tc'] != [False, None, None] and f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: - q.put((f, f_v)) - f_loaded += 1 + if not f_v['tc'][0]: + if f_v['tc'] != [False, None, None]: + if f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: + q.put((f, f_v)) + f_loaded += 1 + else: + ignored_files.append(f) + else: + ignored_files.append(f) #print("Adding files to Queue...") is_done.value = True print(f"Loaded {f_loaded} Python files") - def __remove_unchecked_type_annot(self, f_read: str, f_d_repr: dict, init_no_tc_err: int, - f_out_temp: NamedTemporaryFile) -> Tuple[str, int, List[str]]: + def remove_unchecked_type_annot(self, f_path: str, f_read: str, f_d_repr: dict, + init_no_tc_err: int) -> Tuple[str, int, List[str]]: type_annots_removed: List[str] = [] no_try = 0 MAX_TRY = 10 def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): - tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_read, f_d_repr, f_out_temp) + tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_path, f_read, f_d_repr) nonlocal no_try if no_tc_err is not None: if tc: @@ -450,10 +468,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) - no_try += 1 else: org_gt_d = org_gt no_try += 1 + else: + no_try += 1 return tc, no_tc_err, f_code, tc_errors @@ -622,9 +641,15 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): return out_f_code, init_no_tc_err, type_annots_removed, tc_errors - def __type_check_type_annotation(self, f_read: str, f_d_repr: dict, out_f: 
NamedTemporaryFile): + def __type_check_type_annotation(self, f_path: str, f_read: str, f_d_repr: dict): f_t_applied = cst.metadata.MetadataWrapper(cst.parse_module(f_read)).visit(TypeApplier(f_d_repr, apply_nlp=self.apply_nlp)) - write_to_tmp_file(out_f, f_t_applied.code) - tc, no_tc_err, tc_errors = type_check_single_file(out_f.name, MypyManager('mypy', MAX_TC_TIME)) + + # Writing applied code to temp files has an advantage which isolates the file and as a result, + # type-checking may be successful for some failed cases with the original file + # tmp_f = create_tmp_file(".py") + # write_to_tmp_file(tmp_f, f_t_applied.code) + write_file(f_path, f_t_applied.code) + tc, no_tc_err, tc_errors = type_check_single_file(f_path, MypyManager('mypy', MAX_TC_TIME)) + #delete_tmp_file(tmp_f) return tc, no_tc_err, f_t_applied.code, tc_errors From e3ddcc25e4e86827e0588640c22aa0b0654b6b80 Mon Sep 17 00:00:00 2001 From: mir-am Date: Fri, 6 Aug 2021 13:57:12 +0200 Subject: [PATCH 29/31] In the main pipeline, sort projects based on total size of their files --- libsa4py/cst_pipeline.py | 9 +++++---- libsa4py/merge.py | 2 +- libsa4py/utils.py | 8 +++++--- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index 5eda048..b645899 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -232,14 +232,15 @@ def process_project(self, i, project, project_files: List[str]): def run(self, repos_list: List[Dict], jobs, start=0): print(f"Number of projects to be processed: {len(repos_list)}") - repos_list = [(p, list_files(join(self.projects_path, p["author"], p["repo"]))) \ + repos_list = [(p, *list_files(join(self.projects_path, p["author"], p["repo"]))) \ for p in repos_list if not (os.path.exists(self.get_project_filename(p)) and self.use_cache)] - repos_list.sort(key=lambda x: len(x[1]), reverse=True) + # Sorts projects based on total size of their files + repos_list.sort(key=lambda x: x[2], reverse=True) print(f"Number of projects to be processed after considering cache: {len(repos_list)}") start_t = time.time() ParallelExecutor(n_jobs=jobs)(total=len(repos_list))( - delayed(self.process_project)(i, p, p_files) for i, (p, p_files) in enumerate(repos_list, start=start)) + delayed(self.process_project)(i, p, p_files) for i, (p, p_files, p_size) in enumerate(repos_list, start=start)) print("Finished processing %d projects in %s " % (len(repos_list), str(timedelta(seconds=time.time()-start_t)))) if self.use_pyre: @@ -278,6 +279,6 @@ def process_project(self, proj_json_path: str): print(f"Can't parsed file {f} in project {proj_json_path}", pse) def run(self, jobs: int): - proj_jsons = list_files(join(self.output_path, 'processed_projects'), '.json') + proj_jsons, _ = list_files(join(self.output_path, 'processed_projects'), '.json') proj_jsons.sort(key=lambda f: os.stat(f).st_size, reverse=True) ParallelExecutor(n_jobs=jobs)(total=len(proj_jsons))(delayed(self.process_project)(p_j) for p_j in proj_jsons) diff --git a/libsa4py/merge.py b/libsa4py/merge.py index c5932de..8f0e64c 100644 --- a/libsa4py/merge.py +++ b/libsa4py/merge.py @@ -137,6 +137,6 @@ def merge_projects(args): """ Saves merged projects into a single JSON file and a Dataframe """ - merged_jsons = merge_jsons_to_dict(list_files(join(args.o, 'processed_projects'), ".json"), args.l) + merged_jsons = merge_jsons_to_dict(list_files(join(args.o, 'processed_projects'), ".json")[0], args.l) save_json(join(args.o, 'merged_%s_projects.json' % (str(args.l) if args.l is not None else 
'all')), merged_jsons) create_dataframe_fns(args.o, merged_jsons) diff --git a/libsa4py/utils.py b/libsa4py/utils.py index c247c40..c87931e 100644 --- a/libsa4py/utils.py +++ b/libsa4py/utils.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple from tqdm import tqdm from joblib import Parallel from os.path import join, isdir @@ -54,18 +54,20 @@ def tmp(op_iter): # return directory -def list_files(directory: str, file_ext: str = ".py") -> list: +def list_files(directory: str, file_ext: str = ".py") -> Tuple[list, int]: """ List all files in the given directory (recursively) """ filenames = [] + dir_size = 0 for root, dirs, files in os.walk(directory): for filename in files: if filename.endswith(file_ext): filenames.append(os.path.join(root, filename)) + dir_size += Path(os.path.join(root, filename)).stat().st_size - return filenames + return filenames, dir_size def read_file(filename: str) -> str: From 715aea8adaccafc11e705fbbb073cbd0f5204b26 Mon Sep 17 00:00:00 2001 From: mir-am Date: Tue, 10 Aug 2021 16:07:35 +0200 Subject: [PATCH 30/31] Improvements to TypeRemover: (1) Copying input dataset to another dest. for analysis (2) preserve removed type annot. when type errors aren't resolved --- libsa4py/__main__.py | 7 +-- libsa4py/cst_pipeline.py | 94 ++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/libsa4py/__main__.py b/libsa4py/__main__.py index 4eff86c..4bcd1e7 100644 --- a/libsa4py/__main__.py +++ b/libsa4py/__main__.py @@ -18,7 +18,7 @@ def apply_types_projects(args): def remove_err_type_annotations(args): - tar = TypeAnnotationsRemoval(args.p, args.o, "", args.l, args.dry_run) + tar = TypeAnnotationsRemoval(args.i, args.o, args.p, args.l, args.dry_run) tar.run(args.j) @@ -65,8 +65,9 @@ def main(): apply_parser.set_defaults(func=apply_types_projects) remove_parser = sub_parsers.add_parser('remove') - remove_parser.add_argument("--p", required=True, type=str, help="Path to Python projects") - remove_parser.add_argument("--o", required=True, type=str, help="Path to store JSON-based processed projects") + remove_parser.add_argument("--i", required=True, type=str, help="Path to input dataset") + remove_parser.add_argument("--o", required=True, type=str, help="Path to output dataset") + remove_parser.add_argument("--p", required=True, type=str, help="Path to JSON-formatted processed projects") remove_parser.add_argument("--j", default=cpu_count(), type=int, help="Number of workers for processing files") remove_parser.add_argument("--l", required=False, type=int, help="Number of projects to process") remove_parser.add_argument("--d", dest='dry_run', action='store_true', diff --git a/libsa4py/cst_pipeline.py b/libsa4py/cst_pipeline.py index a53a554..fd8d979 100644 --- a/libsa4py/cst_pipeline.py +++ b/libsa4py/cst_pipeline.py @@ -20,7 +20,7 @@ from libsa4py.exceptions import ParseError, NullProjectException from libsa4py.nl_preprocessing import NLPreprocessor from libsa4py.utils import read_file, list_files, ParallelExecutor, mk_dir_not_exist, save_json, load_json, write_file, \ - create_tmp_file, write_to_tmp_file, delete_tmp_file + create_tmp_file, write_to_tmp_file, delete_tmp_file, mk_dir_cp_file from libsa4py.pyre import pyre_server_init, pyre_query_types, pyre_server_shutdown, pyre_kill_all_servers, \ clean_pyre_config from libsa4py.type_check import MypyManager, type_check_single_file @@ -333,11 +333,11 @@ class TypeAnnotationsRemoval: MAX_TYPE_ERRORS_PER_FILE = 500 - def __init__(self, projects_path: str, 
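Since list_files now also reports the total size of the matched files, every call site has to unpack a pair, as the changes to merge.py and the pipelines above show. Typical usage after this patch:

    from libsa4py.utils import list_files

    files, dir_size = list_files("repos/author/repo")
    print(f"{len(files)} Python files, {dir_size / 1024:.1f} KiB on disk")
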
processed_projects_path: str, output_path: str, no_projects_limit: int = None, + def __init__(self, input_projects_path: str, output_projects_path: str, processed_projects_path: str, no_projects_limit: int = None, dry_run: bool = False, apply_nlp: bool = True): - self.projects_path = projects_path + self.input_projects_path = input_projects_path self.processed_projects_path = processed_projects_path - self.output_path = output_path + self.output_projects_path = output_projects_path self.no_projects_limit = no_projects_limit self.dry_run = dry_run self.apply_nlp = apply_nlp @@ -358,8 +358,8 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict, ignored_files: if f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D'] > 0: try: #tmp_f = create_tmp_file(".py") - f_read = read_file(join(self.projects_path, f)) - _, tc_errs, type_annot_r, tc_errors = self.remove_unchecked_type_annot(join(self.projects_path, f), + f_read = read_file(join(self.output_projects_path, f)) + _, tc_errs, type_annot_r, tc_errors = self.remove_unchecked_type_annot(join(self.output_projects_path, f), f_read, f_d_repr, f_d_repr['tc'][1]) print(f"F: {f} | init_tc_errors: {f_d_repr['tc'][1]} | tc_errors: {tc_errs} | ta_r: {type_annot_r} | \ total_ta: {f_d_repr['no_types_annot']['I'] + f_d_repr['no_types_annot']['D']} | Queue size: {q.qsize()}") @@ -369,9 +369,9 @@ def process_file(self, q: Queue, is_f_loader_done, tc_res: dict, ignored_files: # Path(join(self.output_path, Path(f).parent)).mkdir(parents=True, exist_ok=True) if tc_errs == 0: if self.dry_run: - write_file(join(self.projects_path, f), f_read) + write_file(join(self.output_projects_path, f), f_read) else: - write_file(join(self.projects_path, f), f_read) + write_file(join(self.output_projects_path, f), f_read) ignored_files.append(f) except Exception as e: print(f"F: {f} | e: {e}") @@ -393,8 +393,10 @@ def run(self, jobs: int): q = manager.Queue() is_f_loader_done = manager.Value('i', False) ignored_files_a = manager.list() + type_checked_files = manager.list() - file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done, ignored_files_a)) + file_loader = Process(target=self.__load_projects_files, args=(q, is_f_loader_done, ignored_files_a, + type_checked_files)) file_loader.start() #file_loader.join() @@ -430,9 +432,10 @@ def run(self, jobs: int): print(f"Finished fixing invalid types in {str(timedelta(seconds=time.time() - start_t))}") save_json(join(self.processed_projects_path, "tc_ta_results_new.json"), tc_res.copy()) write_file(join(self.processed_projects_path, 'ignored_files.txt'), '\n'.join(list(ignored_files_a) + list(ignored_files_b))) + write_file(join(self.processed_projects_path, 'tced_files.txt'), '\n'.join(list(type_checked_files))) - def __load_projects_files(self, q: Queue, is_done, ignored_files: list): - proj_jsons = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') + def __load_projects_files(self, q: Queue, is_done, ignored_files: list, type_checked_files: list): + proj_jsons, _ = list_files(join(self.processed_projects_path, 'processed_projects'), '.json') proj_jsons = proj_jsons[:self.no_projects_limit] if self.no_projects_limit is not None else proj_jsons f_loaded = 0 for p_j in proj_jsons: @@ -442,15 +445,24 @@ def __load_projects_files(self, q: Queue, is_done, ignored_files: list): if not f_v['tc'][0]: if f_v['tc'] != [False, None, None]: if f_v['tc'][1] <= TypeAnnotationsRemoval.MAX_TYPE_ERRORS_PER_FILE: + mk_dir_cp_file(join('/home/amir/data/MT4Py-pyre-apply', 
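After this restructuring the removal pipeline is driven by three separate paths: the pristine input dataset, the output dataset it writes into, and the directory of processed JSONs. A sketch of what remove_err_type_annotations now wires up from the CLI flags; the three paths are hypothetical:

    from libsa4py.cst_pipeline import TypeAnnotationsRemoval

    tar = TypeAnnotationsRemoval("/data/input_ds", "/data/output_ds",
                                 "/data/processed", no_projects_limit=None,
                                 dry_run=True)
    tar.run(jobs=8)
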
f), join(self.output_projects_path, f)) q.put((f, f_v)) f_loaded += 1 + print(f"Added file {f} to the analysis queue") else: ignored_files.append(f) else: ignored_files.append(f) + else: + type_checked_files.append(f) + #print("Adding files to Queue...") is_done.value = True print(f"Loaded {f_loaded} Python files") + + for f in type_checked_files: + mk_dir_cp_file(join(self.input_projects_path, f), join(self.output_projects_path, f)) + print(f"Copied type-checked file: {f}") def remove_unchecked_type_annot(self, f_path: str, f_read: str, f_d_repr: dict, init_no_tc_err: int) -> Tuple[str, int, List[str]]: @@ -459,18 +471,16 @@ def remove_unchecked_type_annot(self, f_path: str, f_read: str, f_d_repr: dict, no_try = 0 MAX_TRY = 10 - def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): + def type_check_ta(curr_no_tc_err: int, org_gt): tc, no_tc_err, f_code, tc_errors = self.__type_check_type_annotation(f_path, f_read, f_d_repr) nonlocal no_try if no_tc_err is not None: if tc: type_annots_removed.append(org_gt) elif no_tc_err < curr_no_tc_err: - curr_f_code = f_code curr_no_tc_err = no_tc_err type_annots_removed.append(org_gt) else: - org_gt_d = org_gt no_try += 1 else: no_try += 1 @@ -493,10 +503,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(m_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['variables'][m_v] = m_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, m_v_t, - f_d_repr['variables'][m_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, m_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['variables'][m_v] = m_v_t for i, fn in enumerate(f_d_repr['funcs']): for p_n, p_t in fn['params'].items(): @@ -513,10 +524,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['params'][p_n] = p_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['funcs'][i]['params'][p_n]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, p_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['funcs'][i]['params'][p_n] = p_t for fn_v, fn_v_t in fn['variables'].items(): if fn_v_t != "": @@ -532,10 +544,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['funcs'][i]['variables'][fn_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, fn_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['funcs'][i]['variables'][fn_v] = fn_v_t # The return type for module-level functions if f_d_repr['funcs'][i]['ret_type'] != "": @@ -552,10 +565,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['funcs'][i]['ret_type'] = org_t - tc, no_tc_err, f_code, tc_errors = 
type_check_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['funcs'][i]['ret_type']) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, org_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['funcs'][i]['ret_type'] = org_t # The type of class-level vars for c_i, c in enumerate(f_d_repr['classes']): @@ -573,10 +587,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(c_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, c_v_t, - f_d_repr['classes'][c_i]['variables'][c_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, c_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['variables'][c_v] = c_v_t # The type of arguments for class-level functions for fn_i, fn in enumerate(c['funcs']): @@ -594,10 +609,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(p_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, p_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, p_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p_t # The type of local variables for class-level functions for fn_v, fn_v_t in fn['variables'].items(): @@ -614,10 +630,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(fn_v_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, fn_v_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v]) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, fn_v_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = fn_v_t # The return type for class-level functions if f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] != "": @@ -635,10 +652,11 @@ def type_check_ta(curr_no_tc_err: int, curr_f_code: str, org_gt, org_gt_d): # type_annots_removed.append(org_t) # elif no_tc_err == init_no_tc_err: # f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - tc, no_tc_err, f_code, tc_errors = type_check_ta(init_no_tc_err, out_f_code, org_t, - f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type']) + tc, no_tc_err, out_f_code, tc_errors = type_check_ta(init_no_tc_err, org_t) if tc or no_try > MAX_TRY: - return f_code, no_tc_err, type_annots_removed, tc_errors + return out_f_code, no_tc_err, type_annots_removed, tc_errors + else: + f_d_repr['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t return out_f_code, init_no_tc_err, type_annots_removed, tc_errors From 060c4bef721d05a4a970279b0319f250d84e86b7 Mon Sep 17 00:00:00 2001 
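Editor's aside before the follow-up patch: every hunk above instantiates the same greedy pattern — blank one annotation, re-run mypy, keep the removal if the file now type-checks or the error count drops, and restore the annotation otherwise. Below is a minimal, self-contained sketch of that loop, assuming mypy is on PATH; `annotations`, `render_source`, and `count_mypy_errors` are hypothetical stand-ins for the nested `f_d_repr` structure, the `TypeApplier`-based re-rendering, and `MypyManager`, respectively — not the patch's actual API.

import os
import subprocess
import tempfile
from typing import Callable, Dict, List, Tuple


def count_mypy_errors(code: str) -> int:
    # Hypothetical stand-in for MypyManager / type_check_single_file:
    # write the code to a temp file, run mypy on it, and count its errors.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp:
        tmp.write(code)
        path = tmp.name
    try:
        result = subprocess.run(["mypy", path], capture_output=True, text=True)
        return sum(1 for ln in result.stdout.splitlines() if ": error:" in ln)
    finally:
        os.unlink(path)


def greedily_remove_annotations(annotations: Dict[str, str],
                                render_source: Callable[[Dict[str, str]], str]
                                ) -> Tuple[str, List[str]]:
    removed: List[str] = []
    best = count_mypy_errors(render_source(annotations))
    for name, annot in list(annotations.items()):
        if annot == "":
            continue
        annotations[name] = ""  # tentatively blank this annotation
        errors = count_mypy_errors(render_source(annotations))
        if errors == 0:
            # The file now type-checks: stop early, as the patch does.
            removed.append(annot)
            break
        elif errors < best:
            # Fewer errors than before: keep the removal.
            best = errors
            removed.append(annot)
        else:
            # No improvement: restore the original annotation.
            annotations[name] = annot
    return render_source(annotations), removed

The real method additionally caps unsuccessful attempts with MAX_TRY and walks module-level variables, function parameters, local variables, and return types, then the same fields per class.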
From 060c4bef721d05a4a970279b0319f250d84e86b7 Mon Sep 17 00:00:00 2001
From: mir-am
Date: Tue, 10 Aug 2021 16:08:48 +0200
Subject: [PATCH 31/31] Add a utility method to copy files while making required dirs

---
 libsa4py/utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libsa4py/utils.py b/libsa4py/utils.py
index fdae75f..569f9f5 100644
--- a/libsa4py/utils.py
+++ b/libsa4py/utils.py
@@ -1,3 +1,4 @@
+import shutil
 from typing import List, Tuple
 from tqdm import tqdm
 from joblib import Parallel
@@ -82,6 +83,13 @@ def write_file(filename: str, content: str):
     with open(filename, 'w') as file:
         file.write(content)

+def mk_dir_cp_file(src_path: str, dest_path: str):
+    """
+    Creates directories in the destination if they do not exist and copies the given file
+    """
+    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
+    shutil.copy(src_path, dest_path)
+
 def save_json(filename: str, dict_obj: dict):
     """
     Dumps a dict object into a JSON file
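A usage sketch for the new helper (the paths are illustrative): unlike a bare shutil.copy, it first materializes the destination directory tree, and exist_ok=True makes repeated calls safe.

from libsa4py.utils import mk_dir_cp_file

# Copies the file into a destination tree that may not exist yet;
# out/proj/pkg/ is created on the fly before the copy.
mk_dir_cp_file("src/proj/pkg/example.py", "out/proj/pkg/example.py")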