diff --git a/.flake8 b/.flake8 index 883b5c3..1706441 100644 --- a/.flake8 +++ b/.flake8 @@ -2,4 +2,4 @@ exclude = .git,venv max-line-length = 100 extend-ignore = E203 -per-file-ignores = __init__.py:F401 \ No newline at end of file +per-file-ignores = __init__.py:F401 diff --git a/.gitignore b/.gitignore index 2739c35..5bd665a 100644 --- a/.gitignore +++ b/.gitignore @@ -166,4 +166,4 @@ cython_debug/ .vscode/settings.json # pdm -.pdm-python \ No newline at end of file +.pdm-python diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b2fc0e9..fce2f4b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,4 +49,4 @@ repos: entry: pytest language: system pass_filenames: false - always_run: true \ No newline at end of file + always_run: true diff --git a/LICENSE b/LICENSE index 9fbba06..72d4c76 100644 --- a/LICENSE +++ b/LICENSE @@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. diff --git a/README.md b/README.md index 14ac858..733c8ab 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ $ pip install -e . ## User guide -The cleaning and pre-processing has been wrapped in the function `initialise()`. +The cleaning and pre-processing has been wrapped in the function `initialise()`. ```python from rama import initialise @@ -64,8 +64,8 @@ from rama import initialise_humans graph, connected_components, graphs_with_humans = initialise_humans(path, psc_filename, companies_filename, string_nature) ``` -** Coming soon ** -A collection of notebooks can be found in `notebooks/` with a series of quick and simple tutorial on how to analyse the processed data. 
+** Coming soon ** +A collection of notebooks can be found in `notebooks/` with a series of quick and simple tutorials on how to analyse the processed data. ## Contact diff --git a/pyproject.toml b/pyproject.toml index 4f3710b..c77ba61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,4 +62,4 @@ dev = [ "pyroma>=4.2", "pydocstyle>=6.3.0", "grip>=4.6.2", -] \ No newline at end of file +] diff --git a/src/rama/analysing/differential_evolution.py b/src/rama/analysing/differential_evolution.py index 4fe86d0..a628b27 100644 --- a/src/rama/analysing/differential_evolution.py +++ b/src/rama/analysing/differential_evolution.py @@ -1,9 +1,9 @@ import random -import networkx as nx + import numpy as np from tqdm import tqdm -from rama.src.rama.analysing.transfer_money import loss_function +from rama.analysing.transfer_money import loss_function def make_profit_dict(profit_distribution, nodes_with_profit, nodes_without_profit): diff --git a/src/rama/analysing/swapping.py b/src/rama/analysing/swapping.py index 0b2b350..b0dc218 100644 --- a/src/rama/analysing/swapping.py +++ b/src/rama/analysing/swapping.py @@ -1,13 +1,15 @@ import random +from collections import deque + import networkx as nx import numpy as np -#### Restrictions #### +# Restrictions # First restriction -def no_cycles(subgraph): +def no_cycles(subgraph: nx.DiGraph) -> bool: """Function to check that a graph does not have cycles""" subgraph_undirected = subgraph.to_undirected() list_cycles = nx.cycle_basis(subgraph_undirected) @@ -15,7 +17,7 @@ def no_cycles(subgraph): # Second restriction -def connected(subgraph): +def connected(subgraph: nx.DiGraph) -> bool: """Function to check that graph stays as a single connected component""" connected_components = nx.weakly_connected_components(subgraph) list_ccs = list(connected_components) @@ -23,43 +25,36 @@ def connected(subgraph): # Third restriction -def no_more_than_two_per(subgraph, limit=2): +def no_more_than_two_per(subgraph: nx.DiGraph, limit: int = 2) -> 
bool: """Function to check that a company does not contain more than two human owners""" - indegrees = [ - subgraph.in_degree(node) - for node in subgraph.nodes - if subgraph.nodes[node]["human"] - ] - indegrees = np.array(indegrees) + indegrees = np.array( + [subgraph.in_degree(node) for node in subgraph.nodes if subgraph.nodes[node]["human"]] + ) return not any(indegrees > limit) # Fourth restriction -def only_human_roots(subgraph): +def only_human_roots(subgraph: nx.DiGraph) -> bool: """Function to check that a graph only contains human roots""" is_human = [ - subgraph.nodes[node]["human"] - for node in subgraph.nodes - if subgraph.out_degree(node) == 0 + subgraph.nodes[node]["human"] for node in subgraph.nodes if subgraph.out_degree(node) == 0 ] return sum(is_human) == len(is_human) # Fifth restriction -def no_slavery(subgraph): +def no_slavery(subgraph: nx.DiGraph) -> bool: """Function to check that a human cannot own share of another human""" degrees = sum( - subgraph.out_degree(node) - for node in subgraph.nodes - if subgraph.nodes[node]["human"] + subgraph.out_degree(node) for node in subgraph.nodes if subgraph.nodes[node]["human"] ) return degrees == 0 -#### Alterations #### +# Alterations -def one_swap(subgraph, change="origin"): +def one_swap(subgraph: nx.DiGraph, change: str = "origin") -> nx.DiGraph: """ Function to swap an edge in a given graph. By default, we are only changing the source of a link. 
@@ -97,7 +92,7 @@ def one_swap(subgraph, change="origin"): return subgraph_copy -def check_if_subgraph_passes(subgraph, checks): +def check_if_subgraph_passes(subgraph: nx.DiGraph, checks: deque) -> bool: """Function to check if a subgraph passes the given restrictions""" passed = [] for func in checks: @@ -108,7 +103,9 @@ def check_if_subgraph_passes(subgraph, checks): return sum_ == len(checks) -def get_swapped_subgraph(subgraph, checks, n_tries=100, change="random"): +def get_swapped_subgraph( + subgraph: nx.DiGraph, checks: deque, n_tries: int = 100, change: str = "random" +) -> nx.DiGraph | None: """Function that returns a subgraph with an edge swap passing all the checks""" n_try = 0 passing = False @@ -120,3 +117,4 @@ def get_swapped_subgraph(subgraph, checks, n_tries=100, change="random"): return None if passing: return swapped_subgraph + return None diff --git a/src/rama/analysing/transfer_money.py b/src/rama/analysing/transfer_money.py index 35e7964..c7e51b4 100644 --- a/src/rama/analysing/transfer_money.py +++ b/src/rama/analysing/transfer_money.py @@ -1,5 +1,5 @@ import networkx as nx -import numpy as np + dictionary_taxes = dict( zip( @@ -72,9 +72,7 @@ def give_dividends(graph, node, profits): wealth = profits[node] if len(in_hood) != 0: - in_weights = sum( - graph.edges[(neighbour, node)]["weight"] for neighbour in in_hood - ) + in_weights = sum(graph.edges[(neighbour, node)]["weight"] for neighbour in in_hood) else: in_weights = 0 @@ -84,12 +82,9 @@ def give_dividends(graph, node, profits): def recursive_wrapper(graph, profits): """Recursive wrapper""" dummy_dict = { - node: get_dividend_from_neighbours(graph, node, profits) - for node in graph.nodes() - } - return_dict = { - node: give_dividends(graph, node, dummy_dict) for node in graph.nodes() + node: get_dividend_from_neighbours(graph, node, profits) for node in graph.nodes() } + return_dict = {node: give_dividends(graph, node, dummy_dict) for node in graph.nodes()} return return_dict @@ -99,9 
+94,7 @@ def theoretical_wealth(graph, node, profits): for node2 in list(nx.descendants(graph, node)): path = nx.shortest_path(graph, node, node2) - local_wealth = compose_function( - taxes, len(path) - 1, profits[node2], human=False - ) + local_wealth = compose_function(taxes, len(path) - 1, profits[node2], human=False) for i in range(len(path) - 1): local_wealth *= graph.edges[(path[i], path[i + 1])]["weight"] wealth += local_wealth @@ -114,7 +107,5 @@ def theoretical_wealth(graph, node, profits): def theoretical_wrapper(graph, profits): """Theoretical wrapper""" - return_dict = { - node: theoretical_wealth(graph, node, profits) for node in graph.nodes() - } + return_dict = {node: theoretical_wealth(graph, node, profits) for node in graph.nodes()} return return_dict diff --git a/src/rama/processing/cleaning.py b/src/rama/processing/cleaning.py index 8a4d320..67699da 100644 --- a/src/rama/processing/cleaning.py +++ b/src/rama/processing/cleaning.py @@ -1,8 +1,8 @@ from typing import Sequence -import pandas as pd +import pandas as pd -from rama.src.rama.processing.helper_functions import fill_company_number +from rama.processing.helper_functions import fill_company_number def clean_psc( diff --git a/src/rama/processing/helper_functions.py b/src/rama/processing/helper_functions.py index a2ed4e0..2f39430 100644 --- a/src/rama/processing/helper_functions.py +++ b/src/rama/processing/helper_functions.py @@ -1,8 +1,8 @@ -from typing import Sequence import os -import pandas as pd +from typing import Sequence + import numpy as np -from tqdm import tqdm +import pandas as pd def check_dir_exists(path: str) -> None: @@ -22,7 +22,7 @@ def check_dir_exists(path: str) -> None: def get_mutual_company_numbers(psc: pd.DataFrame, companies: pd.DataFrame) -> np.ndarray: """Function to match company numbers from PSCs and companies""" company_numbers_psc = psc.company_number.values - if "CompanyNumer" in companies.columns: + if "CompanyNumber" in companies.columns: 
mutual_company_numbers = companies.loc[ companies.CompanyNumber.isin(company_numbers_psc) ].CompanyNumber.values @@ -120,6 +120,10 @@ def get_company_company_link( psc_companies = psc_companies.dropna(subset=["company_name"]) # Fill owned companies that have already been indexed as owners of other companies + if "company_number" not in companies.columns: + companies = companies.rename(columns={"CompanyNumber": "company_number"}) + if "company_name" not in companies.columns: + companies = companies.rename(columns={"CompanyName": "company_name"}) names_owned = companies.loc[ companies.company_number.isin(psc_companies.company_number), ["company_number", "company_name"], @@ -131,7 +135,7 @@ def get_company_company_link( .reset_index() ) - ### 1 - Index Owners already indexed + # 1 - Index Owners already indexed already_indexed_owners = small_firstlink[ small_firstlink.company_name.isin(psc_companies.company_name.unique()) ][["company_name", "idx_company"]].drop_duplicates() @@ -142,7 +146,7 @@ def get_company_company_link( .reset_index() ) - ### 2 - Index Owners not seen before + # 2 - Index Owners not seen before # do not take into account those that appear in company_names_2 idxs_nan = psc_companies.idx_company.isna() @@ -167,8 +171,8 @@ def get_company_company_link( psc_companies.groupby("company_name")["idx_company"].transform("first"), inplace=True ) - #### Second companies - ### 1 - Index Owneds already indexed by company number + # Second companies + # 1 - Index Owneds already indexed by company number already_indexed_owneds_number = small_firstlink[ small_firstlink.company_number.isin(psc_companies.company_number.unique()) ][["company_number", "idx_company"]].drop_duplicates() @@ -180,7 +184,7 @@ def get_company_company_link( .reset_index() ) - ### 2 - Index Owneds already indexed in column 1 by name + # 2 - Index Owneds already indexed in column 1 by name # Fill owned companies that have already been indexed as owners of other companies unique_names_owned = 
names_owned.company_name.unique() min_ = max_ @@ -218,8 +222,8 @@ def get_list_unique_natures_of_control(psc: pd.DataFrame) -> list: list_unique_natures = [] for _, list_str in enumerate(natures): if list_str != "": - l = eval(list_str) - for element in l: + eval_list = eval(list_str) + for element in eval_list: list_unique_natures.append(element) list_unique_natures = np.unique(np.array(list_unique_natures)) diff --git a/src/rama/processing/initialise_db.py b/src/rama/processing/initialise_db.py index 7c6a737..27b73a6 100644 --- a/src/rama/processing/initialise_db.py +++ b/src/rama/processing/initialise_db.py @@ -1,10 +1,11 @@ from typing import Sequence -import pandas as pd + import networkx as nx +import pandas as pd -from rama.src.rama.processing.load_database_pipeline import process_database, get_graph -from rama.src.rama.processing.node_attributes import set_attributes -from rama.src.rama.processing.study_graphs import get_dict_cluster, classify_cluster +from rama.processing.load_database_pipeline import get_graph, process_database +from rama.processing.node_attributes import set_attributes +from rama.processing.study_graphs import classify_cluster, get_dict_cluster def initialise( @@ -31,18 +32,18 @@ def initialise( merged_firstlink = list_dfs[1] psc_companies = list_dfs[2] - ### Get graph + # Get graph graph = get_graph(edge_list) - ### Set attributes + # Set attributes set_attributes(graph, merged_firstlink, psc_companies, companies) - ### Connected components + # Connected components connected_components = list( sorted(nx.weakly_connected_components(graph), key=len, reverse=True) ) - ### Set attributes to connected components + # Set attributes to connected components dict_cluster = {} for number_of_cluster, set_nodes in enumerate(connected_components): dict_cluster_unclassified = get_dict_cluster(graph, list(set_nodes)) @@ -61,7 +62,7 @@ def initialise_humans( graph, connected_components, dict_cluster = initialise( path, psc_filenames, 
companies_filenames, string_ownership ) - ### get indices where there are humans + # get indices where there are humans graphs_with_humans = [ i for i in range(len(connected_components)) diff --git a/src/rama/processing/lists.py b/src/rama/processing/lists.py index cbbfded..d730d9a 100644 --- a/src/rama/processing/lists.py +++ b/src/rama/processing/lists.py @@ -1,118 +1,121 @@ -human_kinds = ['individual-person-with-significant-control', - 'individual-beneficial-owner', - ] +human_kinds = [ + "individual-person-with-significant-control", + "individual-beneficial-owner", +] -company_kinds = ['corporate-entity-person-with-significant-control', - 'corporate-entity-beneficial-owner', - 'legal-person-person-with-significant-control', - ] +company_kinds = [ + "corporate-entity-person-with-significant-control", + "corporate-entity-beneficial-owner", + "legal-person-person-with-significant-control", +] -other_kinds = ['super-secure-person-with-significant-control'] +other_kinds = ["super-secure-person-with-significant-control"] types_of_ownership = [ - 'ownership-of-shares-25-to-50-percent', - 'ownership-of-shares-25-to-50-percent-as-firm', - 'ownership-of-shares-25-to-50-percent-as-trust', - 'ownership-of-shares-50-to-75-percent', - 'ownership-of-shares-50-to-75-percent-as-firm', - 'ownership-of-shares-50-to-75-percent-as-trust', - 'ownership-of-shares-75-to-100-percent', - 'ownership-of-shares-75-to-100-percent-as-firm', - 'ownership-of-shares-75-to-100-percent-as-trust', - 'ownership-of-shares-more-than-25-percent-registered-overseas-entity', + "ownership-of-shares-25-to-50-percent", + "ownership-of-shares-25-to-50-percent-as-firm", + "ownership-of-shares-25-to-50-percent-as-trust", + "ownership-of-shares-50-to-75-percent", + "ownership-of-shares-50-to-75-percent-as-firm", + "ownership-of-shares-50-to-75-percent-as-trust", + "ownership-of-shares-75-to-100-percent", + "ownership-of-shares-75-to-100-percent-as-firm", + "ownership-of-shares-75-to-100-percent-as-trust", + 
"ownership-of-shares-more-than-25-percent-registered-overseas-entity", ] -psc_columns = ['natures_of_control', - 'ceased_on', - 'name', - 'country_of_residence', - 'kind', - 'nationality', - 'notified_on', - 'date_of_birth.year', - 'date_of_birth.month', - 'name_elements.surname', - 'name_elements.forename', - 'name_elements.title', - 'name_elements.middle_name', - 'company_number', - 'identification.registration_number', - 'ceased', - 'address.country', - 'address.postal_code', - ] +psc_columns = [ + "natures_of_control", + "ceased_on", + "name", + "country_of_residence", + "kind", + "nationality", + "notified_on", + "date_of_birth.year", + "date_of_birth.month", + "name_elements.surname", + "name_elements.forename", + "name_elements.title", + "name_elements.middle_name", + "company_number", + "identification.registration_number", + "ceased", + "address.country", + "address.postal_code", +] psc_columns_rename = { - 'name': 'CompanyName', - 'date_of_birth.year': 'date_of_birth_year', - 'date_of_birth.month': 'date_of_birth_month', - 'name_elements.surname': 'name_surname', - 'name_elements.forename': 'name_forename', - 'name_elements.title': 'name_title', - 'name_elements.middle_name': 'name_middle_name', - 'identification.registration_number': 'registration_number', + "name": "CompanyName", + "date_of_birth.year": "date_of_birth_year", + "date_of_birth.month": "date_of_birth_month", + "name_elements.surname": "name_surname", + "name_elements.forename": "name_forename", + "name_elements.title": "name_title", + "name_elements.middle_name": "name_middle_name", + "identification.registration_number": "registration_number", } -companies_columns = ['CompanyName', - 'CompanyNumber', - 'CompanyCategory', - 'CompanyStatus', - 'CountryOfOrigin', - 'DissolutionDate', - 'IncorporationDate', - 'PreviousName_1.CONDATE', - 'PreviousName_1.CompanyName', - 'PreviousName_2.CONDATE', - 'PreviousName_2.CompanyName', - 'PreviousName_3.CONDATE', - 'PreviousName_3.CompanyName', - 
'PreviousName_4.CONDATE', - 'PreviousName_4.CompanyName', - 'PreviousName_5.CONDATE', - 'PreviousName_5.CompanyName', - 'PreviousName_6.CONDATE', - 'PreviousName_6.CompanyName', - 'PreviousName_7.CONDATE', - 'PreviousName_7.CompanyName', - 'PreviousName_8.CONDATE', - 'PreviousName_8.CompanyName', - 'PreviousName_9.CONDATE', - 'PreviousName_9.CompanyName', - 'PreviousName_10.CONDATE', - 'PreviousName_10.CompanyName' - ] +companies_columns = [ + "CompanyName", + "CompanyNumber", + "CompanyCategory", + "CompanyStatus", + "CountryOfOrigin", + "DissolutionDate", + "IncorporationDate", + "PreviousName_1.CONDATE", + "PreviousName_1.CompanyName", + "PreviousName_2.CONDATE", + "PreviousName_2.CompanyName", + "PreviousName_3.CONDATE", + "PreviousName_3.CompanyName", + "PreviousName_4.CONDATE", + "PreviousName_4.CompanyName", + "PreviousName_5.CONDATE", + "PreviousName_5.CompanyName", + "PreviousName_6.CONDATE", + "PreviousName_6.CompanyName", + "PreviousName_7.CONDATE", + "PreviousName_7.CompanyName", + "PreviousName_8.CONDATE", + "PreviousName_8.CompanyName", + "PreviousName_9.CONDATE", + "PreviousName_9.CompanyName", + "PreviousName_10.CONDATE", + "PreviousName_10.CompanyName", +] companies_columns_rename = { - 'PreviousName_1.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_1.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_2.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_2.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_3.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_3.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_4.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_4.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_5.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_5.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_6.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_6.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_7.CONDATE': 'PreviousName_1_CONDATE', - 
'PreviousName_7.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_8.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_8.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_9.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_9.CompanyName': 'PreviousName_1_CompanyName', - 'PreviousName_10.CONDATE': 'PreviousName_1_CONDATE', - 'PreviousName_10.CompanyName': 'PreviousName_1_CompanyName', - } + "PreviousName_1.CONDATE": "PreviousName_1_CONDATE", + "PreviousName_1.CompanyName": "PreviousName_1_CompanyName", + "PreviousName_2.CONDATE": "PreviousName_2_CONDATE", + "PreviousName_2.CompanyName": "PreviousName_2_CompanyName", + "PreviousName_3.CONDATE": "PreviousName_3_CONDATE", + "PreviousName_3.CompanyName": "PreviousName_3_CompanyName", + "PreviousName_4.CONDATE": "PreviousName_4_CONDATE", + "PreviousName_4.CompanyName": "PreviousName_4_CompanyName", + "PreviousName_5.CONDATE": "PreviousName_5_CONDATE", + "PreviousName_5.CompanyName": "PreviousName_5_CompanyName", + "PreviousName_6.CONDATE": "PreviousName_6_CONDATE", + "PreviousName_6.CompanyName": "PreviousName_6_CompanyName", + "PreviousName_7.CONDATE": "PreviousName_7_CONDATE", + "PreviousName_7.CompanyName": "PreviousName_7_CompanyName", + "PreviousName_8.CONDATE": "PreviousName_8_CONDATE", + "PreviousName_8.CompanyName": "PreviousName_8_CompanyName", + "PreviousName_9.CONDATE": "PreviousName_9_CONDATE", + "PreviousName_9.CompanyName": "PreviousName_9_CompanyName", + "PreviousName_10.CONDATE": "PreviousName_10_CONDATE", + "PreviousName_10.CompanyName": "PreviousName_10_CompanyName", +} natures_patterns_str = [ - 'ownership-of-shares', - 'part-right-to-share-surplus-assets', - 'right-to-appoint-and-remove-directors', - 'right-to-appoint-and-remove-members', - 'right-to-appoint-and-remove-person', - 'right-to-share-surplus-assets', - 'significant-influence-or-control', - 'voting-rights', - ] - + "ownership-of-shares", + "part-right-to-share-surplus-assets", + "right-to-appoint-and-remove-directors", 
+ "right-to-appoint-and-remove-members", + "right-to-appoint-and-remove-person", + "right-to-share-surplus-assets", + "significant-influence-or-control", + "voting-rights", +] diff --git a/src/rama/processing/load_database_pipeline.py b/src/rama/processing/load_database_pipeline.py index c48a2e7..901d51a 100644 --- a/src/rama/processing/load_database_pipeline.py +++ b/src/rama/processing/load_database_pipeline.py @@ -1,21 +1,22 @@ from typing import Sequence + +import networkx as nx import numpy as np import pandas as pd -import networkx as nx -from rama.src.rama.processing.helper_functions import ( - get_mutual_company_numbers, - get_human_company_links, +from rama.processing.cleaning import clean_companies, clean_psc +from rama.processing.helper_functions import ( get_company_company_link, + get_human_company_links, + get_mutual_company_numbers, ) -from rama.src.rama.processing.lists import ( - psc_columns, - human_kinds, +from rama.processing.lists import ( companies_columns, company_kinds, + human_kinds, other_kinds, + psc_columns, ) -from rama.src.rama.processing.cleaning import clean_psc, clean_companies def process_database( diff --git a/src/rama/processing/node_attributes.py b/src/rama/processing/node_attributes.py index c095d27..ac563e5 100644 --- a/src/rama/processing/node_attributes.py +++ b/src/rama/processing/node_attributes.py @@ -1,10 +1,11 @@ -import pandas as pd -import numpy as np import networkx as nx +import numpy as np +import pandas as pd -from rama.src.rama.processing.lists import types_of_ownership +from rama.processing.lists import types_of_ownership -########## Main function ############## + +# Main function def set_attributes( @@ -46,7 +47,7 @@ def set_attributes( nx.set_node_attributes(graph, dict_outdegree, "out_degree") -## Auxiliary functions ## +# Auxiliary functions def get_company_numbers( psc_companies: pd.DataFrame, df_attr: pd.DataFrame, companies: pd.DataFrame ) -> pd.DataFrame: @@ -104,8 +105,7 @@ def 
get_dict_from_attr_for_companies( set_nodes = set(sorted(list(graph.nodes()))) set_index = set(df_attr.index.values) - indices_non_appearing = set_index.symmetric_difference(set_nodes) - indices_non_appearing = list(indices_non_appearing) + indices_non_appearing = list(set_index.symmetric_difference(set_nodes)) df_non = pd.DataFrame(index=indices_non_appearing) df_non[attr] = np.nan @@ -134,8 +134,7 @@ def attr_kind( set_nodes = set(sorted(list(graph.nodes()))) set_index = set(df_attr.index.values) - indices_non_appearing = set_index.symmetric_difference(set_nodes) - indices_non_appearing = list(indices_non_appearing) + indices_non_appearing = list(set_index.symmetric_difference(set_nodes)) df_non = pd.DataFrame(index=indices_non_appearing) df_non[attr] = np.nan @@ -147,7 +146,7 @@ def attr_kind( return {} -### Attribute dictionaries ### +# Attribute dictionaries def attr_human(graph: nx.DiGraph, merged_firstlink: pd.DataFrame) -> dict: """Returns a dictionary which answers the question 'is the node a human?'. 
The dictionary is expected to be passed to nx.set_node_attributes()""" @@ -211,8 +210,7 @@ def attr_name( set_nodes = set(sorted(list(graph.nodes()))) set_index = set(df_attr.index.values) - indices_non_appearing = set_index.symmetric_difference(set_nodes) - indices_non_appearing = list(indices_non_appearing) + indices_non_appearing = list(set_index.symmetric_difference(set_nodes)) df_non = pd.DataFrame(index=indices_non_appearing) df_non[attr] = np.nan @@ -249,8 +247,7 @@ def attr_company_numbers( set_nodes = set(sorted(list(graph.nodes()))) set_index = set(df_attr.index.values) - indices_non_appearing = set_index.symmetric_difference(set_nodes) - indices_non_appearing = list(indices_non_appearing) + indices_non_appearing = list(set_index.symmetric_difference(set_nodes)) df_non = pd.DataFrame(index=indices_non_appearing) df_non[attr] = np.nan @@ -396,8 +393,7 @@ def attr_address( set_nodes = set(sorted(list(graph.nodes()))) set_index = set(addresses.index.unique()) - indices_non_appearing = set_index.symmetric_difference(set_nodes) - indices_non_appearing = list(indices_non_appearing) + indices_non_appearing = list(set_index.symmetric_difference(set_nodes)) df_non = pd.DataFrame(index=indices_non_appearing) df_non["country"] = np.nan @@ -410,7 +406,6 @@ def attr_address( dict_country = {} for node in df_country.index.unique(): - country = df_country.loc[node, "country"] if isinstance(country, pd.Series): list_countries = country.unique() @@ -427,7 +422,6 @@ def attr_address( dict_postal_code = {} for node in df_postal_code.index.unique(): - postal_code = df_postal_code.loc[node, "postal_code"] if isinstance(postal_code, pd.Series): list_poscal_codes = postal_code.unique() diff --git a/src/rama/processing/study_graphs.py b/src/rama/processing/study_graphs.py index 5da22da..e512b0d 100644 --- a/src/rama/processing/study_graphs.py +++ b/src/rama/processing/study_graphs.py @@ -1,12 +1,14 @@ -from collections import deque -import numpy as np -import networkx as nx 
import datetime as dt +from typing import Any, Sequence + +import networkx as nx +import numpy as np -############ Paths ############ +# Paths -def find_all_paths(graph: nx.DiGraph, x: int | str, path: deque | None = None) -> deque: + +def find_all_paths(graph: nx.DiGraph, x: int | str, path: list | None = None) -> list: """function to find all paths in a graph starting in a given node""" if path is None: path = [] @@ -22,14 +24,14 @@ def find_all_paths(graph: nx.DiGraph, x: int | str, path: deque | None = None) - return paths -def keep_longest_path(list_paths: deque) -> deque: +def keep_longest_path(list_paths: list) -> list: """Function to keep only the longest paths from a list of paths""" max_ = max(len(x) for x in list_paths) list_ = [x for x in list_paths if len(x) == max_] return list_ -def get_max_length(graph: nx.DiGraph, list_nodes: deque) -> int: +def get_max_length(graph: nx.DiGraph, list_nodes: Sequence[str | int]) -> int: """Function to get the max length of a path in a graph""" lens = [] for node in list_nodes: @@ -38,12 +40,12 @@ def get_max_length(graph: nx.DiGraph, list_nodes: deque) -> int: return max_len -############ Dates ############ +# Dates def get_dates_with_nans( - graph: nx.DiGraph, list_nodes: deque, format_date: str = "%Y-%m-%d" -) -> deque: + graph: nx.DiGraph, list_nodes: Sequence[int | str], format_date: str = "%Y-%m-%d" +) -> list: """Function to get the dates from an array containing nans""" dates_str = np.array( [ @@ -62,7 +64,7 @@ def get_dates_with_nans( return dates_dt -def get_growingtime(dates: deque, period: float = 365.2425) -> float: +def get_growingtime(dates: Sequence[Any], period: float = 365.2425) -> float: """Function to get the growing time of a given array of dates.""" dates_without_nans = [date for date in dates if isinstance(date, dt.datetime)] if len(dates_without_nans) != 0: @@ -74,7 +76,7 @@ def get_growingtime(dates: deque, period: float = 365.2425) -> float: return life_years -def get_min_max_dates(dates: 
deque) -> tuple: +def get_min_max_dates(dates: Sequence[Any]) -> tuple: """Function to get the min and max dates from an array""" dates = [date for date in dates if isinstance(date, dt.datetime)] if len(dates) != 0: @@ -86,20 +88,20 @@ def get_min_max_dates(dates: deque) -> tuple: return min_, max_ -############ Branches ############ +# Branches def get_detail_branches( graph: nx.DiGraph, - branches: deque, - dates_of_creation_dt: deque, + branches: list, + dates_of_creation_dt: list, format_date: str = "%Y-%m-%d", period: float = 365.2425, ) -> dict: """Function to get the local details of branches into a dictionary""" detail_branches = {} for b, branch in enumerate(branches): - dict_local = {} + dict_local: dict[str, Any] = {} sprouts = [edge[1] for edge in graph.edges if branch == edge[0]] sprouts_dates_str = [graph.nodes[sprout]["date_of_creation"] for sprout in sprouts] @@ -133,10 +135,13 @@ def get_detail_branches( def get_dict_branches( - graph: nx.DiGraph, list_nodes: deque, period: float = 365.2425, format_date: str = "%Y-%m-%d" + graph: nx.DiGraph, + list_nodes: Sequence[int | str], + period: float = 365.2425, + format_date: str = "%Y-%m-%d", ) -> dict: """Function to get the dictionary of all branches""" - dict_branches = {} + dict_branches: dict[str, Any] = {} dates_subgraph = [graph.nodes[node]["date_of_creation"] for node in list_nodes] dates_subgraph_dt = [ dt.datetime.strptime(date, format_date) for date in dates_subgraph if isinstance(date, str) @@ -174,10 +179,10 @@ def get_dict_branches( return dict_branches -############### Obtain dictionary ############ +# Obtain dictionary -def get_dict_cluster(graph: nx.DiGraph, list_nodes: deque) -> dict: +def get_dict_cluster(graph: nx.DiGraph, list_nodes: Sequence[int | str]) -> dict: """Function to get the dictionary of a subgraph""" in_degrees = np.array([graph.nodes[node]["in_degree"] for node in list_nodes]) out_degrees = np.array([graph.nodes[node]["out_degree"] for node in list_nodes]) @@ -199,7 
+204,7 @@ def get_dict_cluster(graph: nx.DiGraph, list_nodes: deque) -> dict: sic_codes = np.array([graph.nodes[node]["sic_codes"] for node in list_nodes]) dict_branches = get_dict_branches(graph, list_nodes) - dict_cluster = dict() + dict_cluster: dict[str, Any] = {} dict_cluster["number_of_nodes"] = number_of_nodes dict_cluster["number_of_roots"] = number_of_roots dict_cluster["number_of_branches"] = number_of_branches @@ -216,7 +221,7 @@ def get_dict_cluster(graph: nx.DiGraph, list_nodes: deque) -> dict: return dict_cluster -############ Classification ############ +# Classification def classify_cluster(dict_cluster: dict) -> dict: diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c16855b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,18 @@ +import pytest + +import pandas as pd + + +@pytest.fixture(scope="function") +def init_first_link(): + """Returns a simple pandas DataFrame like the one processed by Rama""" + first_link = pd.DataFrame( + data={ + "name": ["Alice", "Bob", "Charles", "Dave", "Emma", "Frances"], + "company_name": ["Ecila", "bob", "Selrahc", "Evade", "Ammerica", "Secnarfff"], + "company_number": ["1234", "5678", "9012", "3456", "7890"], + "idx_human": [1, 2, 3, 4, 5], + "idx_company": [11, 12, 13, 14, 15], + } + ) + return first_link diff --git a/tests/tests.py b/tests/tests.py new file mode 100644 index 0000000..aaeaa80 --- /dev/null +++ b/tests/tests.py @@ -0,0 +1,42 @@ +import pandas as pd +from tqdm import tqdm +import pytest + + +def test_humans(init_first_link): + "Testing humans correctly indexed" + small_firstlink = init_first_link + list_nonpass = [] + for idx_h in tqdm(small_firstlink.idx_human.sort_values().unique()): + if len(small_firstlink.loc[small_firstlink.idx_human == idx_h].name.unique()) != 1: + list_nonpass.append(idx_h) + assert len(list_nonpass) == 0 + + +def 
test_companies_names(init_first_link): + "Testing Companies Names" + small_firstlink = init_first_link + list_nonpass = [] + + for idx_c in tqdm(small_firstlink.idx_company.sort_values().unique()): + if ( + len(small_firstlink.loc[small_firstlink.idx_company == idx_c].company_name.unique()) + != 1 + ): + list_nonpass.append(idx_c) + # assert len(list_nonpass)== 0 + return list_nonpass + + +def test_companies_number(init_first_link): + "Testing Companies Numbers" + small_firstlink = init_first_link + list_nonpass = [] + + for idx_c in tqdm(small_firstlink.idx_company.sort_values().unique()): + if ( + len(small_firstlink.loc[small_firstlink.idx_company == idx_c].company_number.unique()) + != 1 + ): + list_nonpass.append(idx_c) + return list_nonpass diff --git a/tests/tests_for_dfs.py b/tests/tests_for_dfs.py index d60e7d5..e054069 100644 --- a/tests/tests_for_dfs.py +++ b/tests/tests_for_dfs.py @@ -1,69 +1,60 @@ from tqdm import tqdm -import pandas as pd -def test_humans(small_firstlink): - list_nonpass = list() - print('Testing humans correctly indexed') - for idx_h in tqdm(small_firstlink.idx_human.sort_values().unique()): - if len(small_firstlink.loc[small_firstlink.idx_human==idx_h].name.unique()) != 1: - list_nonpass.append(idx_h) - assert len(list_nonpass) == 0 - -def test_companies_names(small_firstlink): - list_nonpass = list() - print('Testing Companies Names') - for idx_c in tqdm(small_firstlink.idx_company.sort_values().unique()): - if len(small_firstlink.loc[small_firstlink.idx_company==idx_c].company_name.unique()) != 1: - list_nonpass.append(idx_c) - #assert len(list_nonpass)== 0 - return list_nonpass - + def test_compare_companies_between_df(small_firstlink, companies): - list_nonpass = list() - print('Comparing companies between dataframes') - small_companies = companies.loc[companies.company_number.isin(small_firstlink.company_number.unique()), - ['company_name', 'company_number']] + "Comparing companies between dataframes" + list_nonpass = [] + 
small_companies = companies.loc[ + companies.company_number.isin(small_firstlink.company_number.unique()), + ["company_name", "company_number"], + ] for idx_c in tqdm(small_firstlink.idx_company.sort_values().unique()): - - df = small_firstlink.loc[small_firstlink.idx_company==idx_c, ['company_name', 'company_number']] + df = small_firstlink.loc[ + small_firstlink.idx_company == idx_c, ["company_name", "company_number"] + ] name_totry = df.company_name.values[0] number_totry = df.company_number.values[0] - - name_reference = small_companies.loc[small_companies.company_number==number_totry, 'company_name'].values[0] - + + name_reference = small_companies.loc[ + small_companies.company_number == number_totry, "company_name" + ].values[0] + if name_reference != name_totry: list_nonpass.append(idx_c) assert len(list_nonpass) == 0 -def test_companies_number(small_firstlink): - list_nonpass = list() - print('Testing Companies Numbers') - for idx_c in tqdm(small_firstlink.idx_company.sort_values().unique()): - if len(small_firstlink.loc[small_firstlink.idx_company==idx_c].company_number.unique()) != 1: - list_nonpass.append(idx_c) - return list_nonpass def test_companies_number2(small_firstlink): - list_nonpass = list() - print('Testing Companies Numbers') + "Testing Companies Numbers" + list_nonpass = [] for idx_c in tqdm(small_firstlink.idx_company_2.sort_values().unique()): - if len(small_firstlink.loc[small_firstlink.idx_company_2==idx_c].company_number.unique()) != 1: - list_nonpass.append(idx_c) + if ( + len(small_firstlink.loc[small_firstlink.idx_company_2 == idx_c].company_number.unique()) + != 1 + ): + list_nonpass.append(idx_c) return list_nonpass - + + def test_compare_companies_between_df2(small_firstlink, companies): - list_nonpass = list() - print('Comparing companies between dataframes') - small_companies = companies.loc[companies.company_number.isin(small_firstlink.company_number.unique()), - ['company_name', 'company_number']] + "Comparing companies 
between dataframes" + list_nonpass = [] + + small_companies = companies.loc[ + companies.company_number.isin(small_firstlink.company_number.unique()), + ["company_name", "company_number"], + ] for idx_c in tqdm(small_firstlink.idx_company_2.sort_values().unique()): - - df = small_firstlink.loc[small_firstlink.idx_company_2==idx_c, ['company_name', 'company_number']] + df = small_firstlink.loc[ + small_firstlink.idx_company_2 == idx_c, ["company_name", "company_number"] + ] name_totry = df.company_name.values[0] number_totry = df.company_number.values[0] - - name_reference = small_companies.loc[small_companies.company_number==number_totry, 'company_name'].values[0] - + + name_reference = small_companies.loc[ + small_companies.company_number == number_totry, "company_name" + ].values[0] + if name_reference != name_totry: list_nonpass.append(idx_c) - assert len(list_nonpass) == 0 \ No newline at end of file + assert len(list_nonpass) == 0