diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1224f3d3..40b8872b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,6 +38,7 @@ jobs: runs-on: ubuntu-latest container: image: ${{ matrix.ckan-image }} + options: --user root services: solr: image: ckan/ckan-solr:${{ matrix.solr-image }} @@ -63,7 +64,8 @@ jobs: - name: Install dependencies (common) run: | - DEBIAN_FRONTEND=noninteractive apt-get --assume-yes --quiet install \ + DEBIAN_FRONTEND=noninteractive apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get --assume-yes --quiet install \ python3-dev \ libxml2-dev \ libxslt1-dev \ @@ -72,7 +74,7 @@ jobs: - name: Install dependencies from requirements.txt run: | pip install -r requirements.txt - pip install pytest-ckan + pip install -r dev-requirements.txt - name: Install harvester run: | diff --git a/ckanext/spatial/harvesters/waf.py b/ckanext/spatial/harvesters/waf.py index b49f59ed..ff3a87ed 100644 --- a/ckanext/spatial/harvesters/waf.py +++ b/ckanext/spatial/harvesters/waf.py @@ -312,6 +312,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0): if 'mailto:' in url: continue if '..' not in url and url[-1] == '/': + if scraper == 'apache' and url[0] == '/': + continue new_depth = depth + 1 if depth > 10: log.info('Max WAF depth reached') diff --git a/ckanext/spatial/tests/waf_extract/html_files/apache-folder.html b/ckanext/spatial/tests/waf_extract/html_files/apache-folder.html new file mode 100644 index 00000000..f9ae82bc --- /dev/null +++ b/ckanext/spatial/tests/waf_extract/html_files/apache-folder.html @@ -0,0 +1,13 @@ + + + + + Index of /apache-folder + + +

Index of /apache-folder

+
      Name                    Last modified      Size  Description
Parent Directory - + record-1.xml 2024-11-07 15:00 356K + subfolder/ 2024-11-12 15:00 - +
+ diff --git a/ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html b/ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html new file mode 100644 index 00000000..494157a4 --- /dev/null +++ b/ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html @@ -0,0 +1,12 @@ + + + + + Index of /apache-folder/subfolder + + +

Index of /apache-folder/subfolder

+
      Name                    Last modified      Size  Description
Parent Directory - + record-2.xml 2024-11-07 16:59 182K +
+ diff --git a/ckanext/spatial/tests/waf_extract/html_files/iis-folder.html b/ckanext/spatial/tests/waf_extract/html_files/iis-folder.html new file mode 100644 index 00000000..2da312ab --- /dev/null +++ b/ckanext/spatial/tests/waf_extract/html_files/iis-folder.html @@ -0,0 +1,5 @@ +iis.server - /iis-folder/

iis.server - /iis-folder/


+ +
[To Parent Directory]

11/7/2024 7:20 AM <dir> subfolder
11/7/2024 3:00 PM 168 record-1.xml

+ + diff --git a/ckanext/spatial/tests/waf_extract/html_files/iis-subfolder.html b/ckanext/spatial/tests/waf_extract/html_files/iis-subfolder.html new file mode 100644 index 00000000..d89ec3a9 --- /dev/null +++ b/ckanext/spatial/tests/waf_extract/html_files/iis-subfolder.html @@ -0,0 +1,3 @@ +iis.server - /iis-folder/subfolder/

iis.server - /iis-folder/subfolder/


+ +
[To Parent Directory]

11/7/2024 4:59 PM 8958 record-2.xml

\ No newline at end of file diff --git a/ckanext/spatial/tests/waf_extract/html_files/nginx-folder.html b/ckanext/spatial/tests/waf_extract/html_files/nginx-folder.html new file mode 100644 index 00000000..e53fef32 --- /dev/null +++ b/ckanext/spatial/tests/waf_extract/html_files/nginx-folder.html @@ -0,0 +1,9 @@ + + +Index of /nginx/ + +

Index of /nginx/


../
+subfolder/                                               07-Nov-2024 15:00                   -
+record-1.xml                                       07-Nov-2024 15:00              364868
+

+ diff --git a/ckanext/spatial/tests/waf_extract/html_files/nginx-subfolder.html b/ckanext/spatial/tests/waf_extract/html_files/nginx-subfolder.html new file mode 100644 index 00000000..68472e9b --- /dev/null +++ b/ckanext/spatial/tests/waf_extract/html_files/nginx-subfolder.html @@ -0,0 +1,8 @@ + + +Index of /nginx/subfolder/ + +

Index of /nginx/subfolder/


../
+record-2.xml                                       07-Nov-2024 16:59              186150
+

+ diff --git a/ckanext/spatial/tests/waf_extract/test_waf_scraper.py b/ckanext/spatial/tests/waf_extract/test_waf_scraper.py new file mode 100644 index 00000000..ab6e1878 --- /dev/null +++ b/ckanext/spatial/tests/waf_extract/test_waf_scraper.py @@ -0,0 +1,53 @@ +import os + +from ckanext.spatial.harvesters.waf import _extract_waf + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) +HTML_DIR = os.path.join(TEST_DIR, "html_files") + +def test_extract_iis(httpserver): + + # read the static html listings (iis, nginx, apache) used as inputs and canned http responses + with \ + open(f"{HTML_DIR}/iis-folder.html", "r") as iis_folder, \ + open(f"{HTML_DIR}/nginx-folder.html", "r") as nginx_folder, \ + open(f"{HTML_DIR}/apache-folder.html", "r") as apache_folder, \ + open(f"{HTML_DIR}/iis-subfolder.html", "r") as iis_subfolder, \ + open(f"{HTML_DIR}/nginx-subfolder.html", "r") as nginx_subfolder, \ + open(f"{HTML_DIR}/apache-subfolder.html", "r") as apache_subfolder: + iis_folder_content = iis_folder.read() + nginx_folder_content = nginx_folder.read() + apache_folder_content = apache_folder.read() + iis_subfolder_content = iis_subfolder.read() + nginx_subfolder_content = nginx_subfolder.read() + apache_subfolder_content = apache_subfolder.read() + + # serve the static subfolder listing when the scraper requests the subfolder url + httpserver.expect_request("/iis-folder/subfolder/").respond_with_data(iis_subfolder_content) + httpserver.expect_request("/nginx-folder/subfolder/").respond_with_data(nginx_subfolder_content) + httpserver.expect_request("/apache-folder/subfolder/").respond_with_data(apache_subfolder_content) + + # let it scrape, traverse and extract the content + iis_results = _extract_waf( + iis_folder_content, + httpserver.url_for("/iis-folder/"), + "iis" + ) + + nginx_results = _extract_waf( + nginx_folder_content, + httpserver.url_for("/nginx-folder/"), + "nginx" + ) + + apache_results = _extract_waf( + apache_folder_content, + httpserver.url_for("/apache-folder/"), + "apache" + ) + + records_expected = [('record-1.xml', 
'2024-11-07 15:00:00'), ('record-2.xml', '2024-11-07 16:59:00')] + + assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in iis_results]) + assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in nginx_results]) + assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in apache_results]) diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 00000000..4f63fcc3 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,3 @@ +pytest-ckan +pytest-httpserver == 1.0.2; python_version < '3.10' +pytest-httpserver; python_version >= '3.10'