diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1224f3d3..40b8872b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -38,6 +38,7 @@ jobs:
runs-on: ubuntu-latest
container:
image: ${{ matrix.ckan-image }}
+ options: --user root
services:
solr:
image: ckan/ckan-solr:${{ matrix.solr-image }}
@@ -63,7 +64,8 @@ jobs:
- name: Install dependencies (common)
run: |
- DEBIAN_FRONTEND=noninteractive apt-get --assume-yes --quiet install \
+ DEBIAN_FRONTEND=noninteractive apt-get update && \
+ DEBIAN_FRONTEND=noninteractive apt-get --assume-yes --quiet install \
python3-dev \
libxml2-dev \
libxslt1-dev \
@@ -72,7 +74,7 @@ jobs:
- name: Install dependencies from requirements.txt
run: |
pip install -r requirements.txt
- pip install pytest-ckan
+ pip install -r dev-requirements.txt
- name: Install harvester
run: |
diff --git a/ckanext/spatial/harvesters/waf.py b/ckanext/spatial/harvesters/waf.py
index b49f59ed..ff3a87ed 100644
--- a/ckanext/spatial/harvesters/waf.py
+++ b/ckanext/spatial/harvesters/waf.py
@@ -312,6 +312,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0):
if 'mailto:' in url:
continue
if '..' not in url and url[-1] == '/':
+ if scraper == 'apache' and url[0] == '/':
+ continue
new_depth = depth + 1
if depth > 10:
log.info('Max WAF depth reached')
diff --git a/ckanext/spatial/tests/waf_extract/html_files/apache-folder.html b/ckanext/spatial/tests/waf_extract/html_files/apache-folder.html
new file mode 100644
index 00000000..f9ae82bc
--- /dev/null
+++ b/ckanext/spatial/tests/waf_extract/html_files/apache-folder.html
@@ -0,0 +1,13 @@
+
+
+
+
+ Index of /apache-folder
+
+
+Index of /apache-folder
+ Name Last modified Size Description
Parent Directory -
+ record-1.xml 2024-11-07 15:00 356K
+ subfolder/ 2024-11-12 15:00 -
+
+
diff --git a/ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html b/ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html
new file mode 100644
index 00000000..494157a4
--- /dev/null
+++ b/ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html
@@ -0,0 +1,12 @@
+
+
+
+
+ Index of /apache-folder/subfolder
+
+
+Index of /apache-folder/subfolder
+ Name Last modified Size Description
Parent Directory -
+ record-2.xml 2024-11-07 16:59 182K
+
+
diff --git a/ckanext/spatial/tests/waf_extract/html_files/iis-folder.html b/ckanext/spatial/tests/waf_extract/html_files/iis-folder.html
new file mode 100644
index 00000000..2da312ab
--- /dev/null
+++ b/ckanext/spatial/tests/waf_extract/html_files/iis-folder.html
@@ -0,0 +1,5 @@
+iis.server - /iis-folder/iis.server - /iis-folder/
+
+ [To Parent Directory]
11/7/2024 7:20 AM <dir> subfolder
11/7/2024 3:00 PM 168 record-1.xml
+
+
diff --git a/ckanext/spatial/tests/waf_extract/html_files/iis-subfolder.html b/ckanext/spatial/tests/waf_extract/html_files/iis-subfolder.html
new file mode 100644
index 00000000..d89ec3a9
--- /dev/null
+++ b/ckanext/spatial/tests/waf_extract/html_files/iis-subfolder.html
@@ -0,0 +1,3 @@
+iis.server - /iis-folder/subfolder/iis.server - /iis-folder/subfolder/
+
+ [To Parent Directory]
11/7/2024 4:59 PM 8958 record-2.xml
\ No newline at end of file
diff --git a/ckanext/spatial/tests/waf_extract/html_files/nginx-folder.html b/ckanext/spatial/tests/waf_extract/html_files/nginx-folder.html
new file mode 100644
index 00000000..e53fef32
--- /dev/null
+++ b/ckanext/spatial/tests/waf_extract/html_files/nginx-folder.html
@@ -0,0 +1,9 @@
+
+
+Index of /nginx/
+
+Index of /nginx/
../
+subfolder/ 07-Nov-2024 15:00 -
+record-1.xml 07-Nov-2024 15:00 364868
+
+
diff --git a/ckanext/spatial/tests/waf_extract/html_files/nginx-subfolder.html b/ckanext/spatial/tests/waf_extract/html_files/nginx-subfolder.html
new file mode 100644
index 00000000..68472e9b
--- /dev/null
+++ b/ckanext/spatial/tests/waf_extract/html_files/nginx-subfolder.html
@@ -0,0 +1,8 @@
+
+
+Index of /nginx/subfolder/
+
+Index of /nginx/subfolder/
../
+record-2.xml 07-Nov-2024 16:59 186150
+
+
diff --git a/ckanext/spatial/tests/waf_extract/test_waf_scraper.py b/ckanext/spatial/tests/waf_extract/test_waf_scraper.py
new file mode 100644
index 00000000..ab6e1878
--- /dev/null
+++ b/ckanext/spatial/tests/waf_extract/test_waf_scraper.py
@@ -0,0 +1,53 @@
+import os
+
+from ckanext.spatial.harvesters.waf import _extract_waf
+
+TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+HTML_DIR = os.path.join(TEST_DIR, "html_files")
+
+def test_extract_iis(httpserver):
+ """Run _extract_waf on the IIS, nginx and Apache listing fixtures and check each yields both records."""
+ # load the static HTML directory listings that stand in for the servers' responses
+ with \
+ open(f"{HTML_DIR}/iis-folder.html", "r") as iis_folder, \
+ open(f"{HTML_DIR}/nginx-folder.html", "r") as nginx_folder, \
+ open(f"{HTML_DIR}/apache-folder.html", "r") as apache_folder, \
+ open(f"{HTML_DIR}/iis-subfolder.html", "r") as iis_subfolder, \
+ open(f"{HTML_DIR}/nginx-subfolder.html", "r") as nginx_subfolder, \
+ open(f"{HTML_DIR}/apache-subfolder.html", "r") as apache_subfolder:
+ iis_folder_content = iis_folder.read()
+ nginx_folder_content = nginx_folder.read()
+ apache_folder_content = apache_folder.read()
+ iis_subfolder_content = iis_subfolder.read()
+ nginx_subfolder_content = nginx_subfolder.read()
+ apache_subfolder_content = apache_subfolder.read()
+
+ # serve each subfolder listing from the test HTTP server for when the scraper requests the subfolder URL
+ httpserver.expect_request("/iis-folder/subfolder/").respond_with_data(iis_subfolder_content)
+ httpserver.expect_request("/nginx-folder/subfolder/").respond_with_data(nginx_subfolder_content)
+ httpserver.expect_request("/apache-folder/subfolder/").respond_with_data(apache_subfolder_content)
+
+ # let it scrape, traverse and extract the content
+ iis_results = _extract_waf(
+ iis_folder_content,
+ httpserver.url_for("/iis-folder/"),
+ "iis"
+ )
+
+ nginx_results = _extract_waf(
+ nginx_folder_content,
+ httpserver.url_for("/nginx-folder/"),
+ "nginx"
+ )
+
+ apache_results = _extract_waf(
+ apache_folder_content,
+ httpserver.url_for("/apache-folder/"),
+ "apache"
+ )
+
+ records_expected = [('record-1.xml', '2024-11-07 15:00:00'), ('record-2.xml', '2024-11-07 16:59:00')]
+
+ assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in iis_results])
+ assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in nginx_results])
+ assert records_expected == sorted([(os.path.basename(r[0]), r[1]) for r in apache_results])
diff --git a/dev-requirements.txt b/dev-requirements.txt
new file mode 100644
index 00000000..4f63fcc3
--- /dev/null
+++ b/dev-requirements.txt
@@ -0,0 +1,3 @@
+pytest-ckan
+pytest-httpserver == 1.0.2; python_version < '3.10'
+pytest-httpserver; python_version >= '3.10'