Skip to content

Commit

Permalink
Added a filter
Browse files Browse the repository at this point in the history
  • Loading branch information
JulienTD committed Feb 23, 2022
1 parent 3f1e9c9 commit ed3b5de
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 22 deletions.
15 changes: 10 additions & 5 deletions hashtheplanet/config/config.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
"""
This module handles the config file.
"""
from enum import Enum
import json
from typing import Dict, List

from hashtheplanet.resources.git_resource import GitResource
from hashtheplanet.resources.npm_resource import NpmResource

class ConfigField(Enum):
    """
    This enum lists the fields that can be read from a resource entry of the config file.

    Each member's value is the key used to look the field up in that entry.
    """
    # Key holding the targets of a resource.
    TARGETS = "targets"
    # Key holding the exclusion pattern of a resource.
    EXCLUDE_REGEX = "exclude_regex"

class Config():
"""
This class implements methods to manipulate the config file.
Expand All @@ -25,15 +30,15 @@ def parse(self, config_path: str):
with open(config_path, "r", encoding="utf-8") as file_fp:
self._config = json.load(file_fp)

def get(self, resource_name: str, config_field: "ConfigField"):
    """
    This method returns the content of a config field for the given resource.

    :param resource_name: key of the resource entry in the config (e.g. "git")
    :param config_field: the field to read from that resource entry
    :return: the field content; an empty list when the resource is not configured
        (or no field is given); None when the entry does not define the field
    """
    resource_config: Dict = self._config.get(resource_name)

    # The guard must cover the looked-up entry: an unconfigured resource yields None here,
    # and calling .get() on it would raise AttributeError instead of returning an empty result.
    if resource_config is None or config_field is None:
        return []
    return resource_config.get(config_field.value)

def get_used_resources(self) -> List[str]:
"""
Expand Down
7 changes: 4 additions & 3 deletions hashtheplanet/core/hashtheplanet.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from sqlalchemy.orm import sessionmaker

# project imports
from hashtheplanet.config.config import Config
from hashtheplanet.config.config import Config, ConfigField
from hashtheplanet.executor.executor import Executor
from hashtheplanet.sql.db_connector import Base, DbConnector, Hash

Expand Down Expand Up @@ -90,10 +90,11 @@ def compute_hashs(self):
self._config.parse(self._input_file)

for resource_name in self._config.get_used_resources():
targets = self._config.get_targets(resource_name)
targets = self._config.get(resource_name, ConfigField.TARGETS)
exclude_regex = self._config.get(resource_name, ConfigField.EXCLUDE_REGEX)

for target in targets:
self._executor.execute(resource_name, target)
self._executor.execute(resource_name, target, exclude_regex)

logger.info("Computing done")

Expand Down
5 changes: 3 additions & 2 deletions hashtheplanet/executor/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module handles the resource executions.
"""
from importlib import import_module
from typing import Optional

from loguru import logger

Expand All @@ -17,7 +18,7 @@ def __init__(self, database: DbConnector, session_scope):
self._database = database
self._session_scope = session_scope

def execute(self, resource_name: str, target: str):
def execute(self, resource_name: str, target: str, exclude_regex: Optional[str] = None):
"""
This method executes a resource to compute hashes.
"""
Expand All @@ -31,4 +32,4 @@ def execute(self, resource_name: str, target: str):
return

resource_instance: Resource = getattr(module, resource_class_name)(self._database)
resource_instance.compute_hashes(self._session_scope, target)
resource_instance.compute_hashes(self._session_scope, target, exclude_regex)
16 changes: 11 additions & 5 deletions hashtheplanet/resources/git_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import subprocess
import tempfile
from stat import S_ISDIR, S_ISREG
from typing import List, Tuple
from typing import List, Optional, Tuple

# third party imports
from git import GitCommandError, Repo
Expand Down Expand Up @@ -54,10 +54,11 @@ def get_all_files_from_commit(commit: Commit) -> List[Tuple[FilePath, BlobHash]]
file_list.append((blob.path, blob.hexsha))
return file_list

@staticmethod
def _hash_files(
self,
files: List[GitFileMetadata],
repo_dir_path: str
repo_dir_path: str,
exclude_regex: Optional[str]
) -> List[FileMetadata]:
"""
This method calculates the SHA256 hashes of input files.
Expand All @@ -69,6 +70,8 @@ def _hash_files(
os.chdir(repo_dir_path)

for (file_path, tag_name, blob_hash) in files:
if not self.can_save(exclude_regex, file_path):
continue
try:
# We need to use a subprocess and not the GitPython library
# because when we execute "git cat-file -p [blob]" with it, it always removes the \n from the last line.
Expand Down Expand Up @@ -192,17 +195,20 @@ def _filter_stored_tags(stored_versions: List[VersionTable], found_tags: List[Ta
result.append(found_tag)
return result

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method clones the repository from url, retrieves tags, compares each tags to retrieve only modified files,
computes their hashes and then stores the tags & files information in the database.
"""
print("Regex:")
print(exclude_regex)
technology = target.split('.git')[0].split('/')[-1]
tags: List[Tag] = []
files: List[GitFileMetadata] = []

with tempfile.TemporaryDirectory() as tmp_dir_name:
try:
logger.info(f"Cloning {target}")
repo = self.clone_repository(target, tmp_dir_name)
except GitCommandError as error:
logger.warning(f"Error while cloning repository on {target}: {error}")
Expand All @@ -225,7 +231,7 @@ def compute_hashes(self, session_scope, target: str):
files += self._get_diff_files(tags)

logger.info("Generating hashes ...")
files_info = self._hash_files(files, tmp_dir_name)
files_info = self._hash_files(files, tmp_dir_name, exclude_regex)

logger.info("Saving hashes ...")
self._save_hashes(session_scope, files_info, tags, technology)
4 changes: 2 additions & 2 deletions hashtheplanet/resources/npm_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#standard imports
import tarfile
import tempfile
from typing import Dict, List, Set, Tuple
from typing import Dict, List, Optional, Set, Tuple
import requests

# third party imports
Expand Down Expand Up @@ -94,7 +94,7 @@ def _save_hashes(
self._database.insert_file(session, npm_module_name, file_path)
self._database.insert_or_update_hash(session, file_hash, npm_module_name, [version])

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method downloads all versions of an npm module and stores all the versions with their associated files
and hashes and stores them in the database.
Expand Down
13 changes: 12 additions & 1 deletion hashtheplanet/resources/resource.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""
This module contains the base class for the resources.
"""
import re
from typing import Optional

from hashtheplanet.sql.db_connector import DbConnector

class Resource(): # pylint: disable=too-few-public-methods
Expand All @@ -12,8 +15,16 @@ class Resource(): # pylint: disable=too-few-public-methods
def __init__(self, database: DbConnector):
self._database = database

def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str] = None):
    """
    This method computes all the versions and their associated files & hashes and stores them in the database.

    This base implementation only raises; each resource is expected to override it.
    exclude_regex is optional and defaults to None so that callers using the previous
    two-argument form keep working.
    """
    raise NotImplementedError()

def can_save(self, exclude_regex: Optional[str], file_path: str) -> bool:
    """
    This method tells whether the specified file can be saved in the database.

    A file can be saved when no exclusion pattern is given (None or an empty string),
    or when the pattern is not found anywhere in file_path (re.search semantics,
    so the pattern is not anchored unless it says so itself).
    """
    if not exclude_regex:
        return True
    return re.search(exclude_regex, file_path) is None
3 changes: 2 additions & 1 deletion src/tech_list.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"https://github.com/drupal/drupal.git",
"https://github.com/magento/magento2.git",
"https://github.com/joomla/joomla-cms.git"
]
],
"exclude_regex": "\\.php$|^(tests\\/|test\\/)"
},
"npm": {
"targets": [
Expand Down
6 changes: 3 additions & 3 deletions tests/config/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from unittest.mock import MagicMock, mock_open, patch

# project imports
from hashtheplanet.config.config import Config
from hashtheplanet.config.config import Config, ConfigField

def get_mock_open(files: Dict[str, str]):
def open_mock(filename, *args, **kwargs):
Expand Down Expand Up @@ -61,9 +61,9 @@ def test_get_targets():
config = Config()

with patch.dict(config._config, {"git": {"targets": ["target1", "target2"]}}):
assert len(config.get_targets("git")) == 2
assert len(config.get_targets("git", ConfigField.TARGETS)) == 2

assert len(config.get_targets("npm")) == 0
assert len(config.get_targets("npm", ConfigField.TARGETS)) == 0

def test_get_used_resources():
config = Config()
Expand Down
7 changes: 7 additions & 0 deletions tests/resources/test_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,10 @@ def test_compute_hashes():
assert False
except NotImplementedError as error:
assert True

def test_can_save():
    """
    can_save keeps every file when no pattern is given and rejects files matching the pattern.
    """
    resource = Resource("test")

    # No exclusion pattern: everything can be saved.
    assert resource.can_save(None, "test.php") is True
    assert resource.can_save("", "test.php") is True

    # Escaped dot + end anchor so only the real extension is matched.
    assert resource.can_save(r"\.php$", "test.php") is False
    assert resource.can_save(r"\.js$", "test.php") is True

    # A leading-directory pattern excludes files under it, but not the same name deeper in the tree.
    assert resource.can_save(r"^tests/", "tests/foobar.js") is False
    assert resource.can_save(r"^tests/", "src/tests/foobar.js") is True

0 comments on commit ed3b5de

Please sign in to comment.