From 7611d796b037a07c8c11ff43bd56ef0b2a83b6b4 Mon Sep 17 00:00:00 2001
From: mir-am
Date: Tue, 3 May 2022 16:49:16 +0200
Subject: [PATCH 1/2] Speed up `preprocess` using `parallel_apply()`

---
 type4py/preprocess.py | 88 +++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 33 deletions(-)

diff --git a/type4py/preprocess.py b/type4py/preprocess.py
index 21b619e..b76b666 100644
--- a/type4py/preprocess.py
+++ b/type4py/preprocess.py
@@ -3,13 +3,16 @@
 from type4py import logger, AVAILABLE_TYPES_NUMBER, MAX_PARAM_TYPE_DEPTH, AVAILABLE_TYPE_APPLY_PROB
 from libsa4py.merge import merge_jsons_to_dict, create_dataframe_fns, create_dataframe_vars
 from libsa4py.cst_transformers import ParametricTypeDepthReducer
+from libsa4py.cst_visitor import CountParametricTypeDepth
 from libsa4py.cst_lenient_parser import lenient_parse_module
 from libsa4py.utils import list_files
+from libcst import parse_module, ParserSyntaxError
 from typing import Tuple
 from ast import literal_eval
 from collections import Counter
 from os.path import exists, join
 from tqdm import tqdm
+from pandarallel import pandarallel
 import regex
 import os
 import pickle
@@ -19,6 +22,7 @@
 logger.name = __name__
 tqdm.pandas()
+pandarallel.initialize(progress_bar=True)
 
 # Precompile often used regex
 first_cap_regex = regex.compile('(.)([A-Z][a-z]+)')
@@ -81,7 +85,7 @@ def resolve_type_alias(t: str):
 
     df_param['arg_type'] = df_param['arg_type'].progress_apply(resolve_type_alias)
     df_ret['return_type'] = df_ret['return_type'].progress_apply(resolve_type_alias)
-    df_vars['var_type'] = df_vars['var_type'].progress_apply(resolve_type_alias)
+    df_vars['var_type'] = df_vars['var_type'].parallel_apply(resolve_type_alias)
 
     return df_param, df_ret, df_vars
 
@@ -90,11 +94,8 @@ def preprocess_parametric_types(df_param: pd.DataFrame, df_ret: pd.DataFrame,
     """
     Reduces the depth of parametric types
     """
-    from libcst import parse_module, ParserSyntaxError
-    global s
-    s = 0
+
     def reduce_depth_param_type(t: str) -> str:
-        global s
         if regex.match(r'.+\[.+\]', t):
             try:
                 t = parse_module(t)
@@ -107,14 +108,17 @@ def reduce_depth_param_type(t: str) -> str:
-                    s += 1
                     return t.code
                 except ParserSyntaxError:
-                    return None
+                    return ""
+                except Exception:
+                    return ""
+            except Exception:
+                return ""
         else:
             return t
 
-    df_param['arg_type'] = df_param['arg_type'].progress_apply(reduce_depth_param_type)
-    df_ret['return_type'] = df_ret['return_type'].progress_apply(reduce_depth_param_type)
-    df_vars['var_type'] = df_vars['var_type'].progress_apply(reduce_depth_param_type)
-    logger.info(f"Sucssesfull lenient parsing {s}")
+    df_param['arg_type'] = df_param['arg_type'].parallel_apply(reduce_depth_param_type)
+    df_ret['return_type'] = df_ret['return_type'].parallel_apply(reduce_depth_param_type)
+    df_vars['var_type'] = df_vars['var_type'].parallel_apply(reduce_depth_param_type)
 
     return df_param, df_ret, df_vars
 
@@ -165,29 +169,31 @@ def gen_argument_df(df: pd.DataFrame) -> pd.DataFrame:
     :param df: dataframe for which to extract argument
     :return: argument dataframe
     """
-    arguments = []
-    for i, row in tqdm(df.iterrows(), total=len(df.index), desc="Processing arguments"):
-        for p_i, arg_name in enumerate(literal_eval(row['arg_names'])):
-
-            # Ignore self arg
-            if arg_name == 'self':
-                continue
-
-            arg_type = literal_eval(row['arg_types'])[p_i].strip('\"')
-
-            # Ignore Any or None types
-            # TODO: Ignore also object type
-            # TODO: Ignore Optional[Any]
-            if arg_type == '' or arg_type in {'Any', 'None', 'object'}:
-                continue
-
-            arg_descr = literal_eval(row['arg_descrs'])[p_i]
-            arg_occur = [a.replace('self', '').strip() if 'self' in a.split() else a for a in literal_eval(row['args_occur'])[p_i]]
-            other_args = " ".join([a for a in literal_eval(row['arg_names']) if a != 'self'])
-            arguments.append([row['file'], row['name'], row['func_descr'], arg_name, arg_type, arg_descr, other_args, arg_occur, row['aval_types']])
-
-    return pd.DataFrame(arguments, columns=['file', 'func_name', 'func_descr', 'arg_name', 'arg_type', 'arg_comment', 'other_args',
-                                            'arg_occur', 'aval_types'])
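+    # pandarallel runs preprocess_arguments in separate worker processes, so
+    # appends to a plain Python list would be lost; a Manager-backed list is
+    # shared across processes.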
+    from multiprocessing import Manager
+    with Manager() as m:
+        arguments = m.list()
+        def preprocess_arguments(row):
+            for p_i, arg_name in enumerate(literal_eval(row['arg_names'])):
+                # Ignore self arg
+                if arg_name == 'self':
+                    continue
+
+                arg_type = literal_eval(row['arg_types'])[p_i].strip('\"')
+
+                # Ignore Any or None types
+                # TODO: Ignore also object type
+                # TODO: Ignore Optional[Any]
+                if arg_type == '' or arg_type in {'Any', 'None', 'object'}:
+                    continue
+
+                arg_descr = literal_eval(row['arg_descrs'])[p_i]
+                arg_occur = [a.replace('self', '').strip() if 'self' in a.split() else a for a in literal_eval(row['args_occur'])[p_i]]
+                other_args = " ".join([a for a in literal_eval(row['arg_names']) if a != 'self'])
+                arguments.append([row['file'], row['name'], row['func_descr'], arg_name, arg_type, arg_descr, other_args, arg_occur, row['aval_types']])
+
+        df.parallel_apply(preprocess_arguments, axis=1)
+        return pd.DataFrame(list(arguments), columns=['file', 'func_name', 'func_descr', 'arg_name', 'arg_type', 'arg_comment', 'other_args',
+                                                      'arg_occur', 'aval_types'])
 
 def filter_return_dp(df: pd.DataFrame) -> pd.DataFrame:
     """
@@ -292,6 +298,19 @@ def trans_aval_type(x):
 
     return df_param, df_ret
 
+def sanity_check_param_types(df_param: pd.DataFrame, df_ret: pd.DataFrame, df_vars: pd.DataFrame, max_depth: int=2):
+    """
+    A sanity-check for the depth of parametric types.
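+    Asserts that no parametric type in the given dataframes exceeds max_depth.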
+    """
+    def count_param_type_depth(param_type: str) -> int:
+        cptd_visitor = CountParametricTypeDepth()
+        parse_module(param_type).visit(cptd_visitor)
+        return cptd_visitor.type_annot_depth
+
+    assert (df_param['arg_type'].apply(count_param_type_depth) > max_depth).any() == False
+    assert (df_ret['return_type'].apply(count_param_type_depth) > max_depth).any() == False
+    assert (df_vars['var_type'].apply(count_param_type_depth) > max_depth).any() == False
+
 def preprocess_ext_fns(output_dir: str, limit: int = None, apply_random_vth: bool = False):
     """
     Applies preprocessing steps to the extracted functions
@@ -364,6 +383,7 @@ def preprocess_ext_fns(output_dir: str, limit: int = None, apply_random_vth: boo
     processed_proj_fns = filter_functions(processed_proj_fns)
 
     # Extracts type hints for functions' arguments
+    logger.info("Preprocessing functions' arguments")
     processed_proj_fns_params = gen_argument_df(processed_proj_fns)
 
     # Filters out functions: (1) without a return type (2) with the return type of Any or None (3) without a return expression
@@ -383,6 +403,8 @@ def preprocess_ext_fns(output_dir: str, limit: int = None, apply_random_vth: boo
     processed_proj_fns_params, processed_proj_fns, processed_proj_vars = preprocess_parametric_types(processed_proj_fns_params,
                                                                                                      processed_proj_fns, processed_proj_vars)
 
+    #sanity_check_param_types(processed_proj_fns_params, processed_proj_fns, processed_proj_vars, MAX_PARAM_TYPE_DEPTH)
+
     # Exclude variables without a type
     processed_proj_vars = filter_var_wo_type(processed_proj_vars)
 

From 8a05a285c4321f2ae7990066eaf3ae75ee83916a Mon Sep 17 00:00:00 2001
From: mir-am
Date: Tue, 3 May 2022 16:55:15 +0200
Subject: [PATCH 2/2] Add `pandarallel` to the requirements file

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 12931ce..933ea74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,5 @@ annoy>=1.17.0
 torch
 libsa4py
 onnx
-onnxruntime
\ No newline at end of file
+onnxruntime
+pandarallel
\ No newline at end of file
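
For reference, the pandarallel pattern both patches rely on boils down to the
following minimal, self-contained sketch (the dataframe, column names, and the
lambda are illustrative only, not part of the patch):

    import pandas as pd
    from pandarallel import pandarallel

    # One-time setup; spawns worker processes and shows per-worker progress bars.
    pandarallel.initialize(progress_bar=True)

    df = pd.DataFrame({"type_annot": ["List[List[int]]", "int", "Dict[str, int]"]})

    # parallel_apply is a drop-in replacement for Series.apply/progress_apply:
    # rows are split into chunks and each chunk is processed by a worker process.
    df["n_brackets"] = df["type_annot"].parallel_apply(lambda t: t.count("["))
    print(df)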