Commit

Changes to gitlab manager fixing pagination

Simran committed Sep 20, 2024
1 parent 4e83579 commit 48fea60
Showing 3 changed files with 99 additions and 30 deletions.
19 changes: 18 additions & 1 deletion Dockerfile
@@ -2,9 +2,26 @@ FROM python:3.9-slim
 
 WORKDIR /app
 
+# Accept proxy settings as build arguments
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+
+# Set the proxy environment variables
+ENV http_proxy=${HTTP_PROXY}
+ENV https_proxy=${HTTPS_PROXY}
+ENV no_proxy=${NO_PROXY}
+
 # Install git and clean up to reduce image size
 RUN apt-get update && \
-    apt-get install -y --no-install-recommends git openssh-client && \
+    apt-get install -y --no-install-recommends \
+        git \
+        openssh-client \
+        curl \
+        inetutils-ping \
+        net-tools \
+        iproute2 \
+        dnsutils && \
     rm -rf /var/lib/apt/lists/*
 
 # Copy only necessary files
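Note: the proxy values baked in by the ENV lines are picked up automatically at runtime by requests, which gitlab_manager.py uses for all API calls, since requests resolves proxies from the http_proxy/https_proxy/no_proxy environment variables. A minimal sketch of that lookup, with a made-up proxy value standing in for whatever the build args provide:

import os
from urllib.request import getproxies

# Hypothetical value standing in for the HTTPS_PROXY build arg.
os.environ["https_proxy"] = "http://proxy.example.com:3128"

# requests consults this same environment-derived mapping on every call.
print(getproxies())  # e.g. {'https': 'http://proxy.example.com:3128'}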
102 changes: 74 additions & 28 deletions gitlab_manager.py
@@ -1,49 +1,95 @@
 import requests
+import re
 from logger import log_info, log_error, log_debug, log_warning, log_critical, log_exception
 from urllib.parse import quote
 from config import GITLAB_TOKEN, GITLAB_API_URL
 
 class GitLabManager:
     def __init__(self, projects):
-        self.projects = [proj.strip() for proj in projects if proj.strip()]
+        self.allowed_projects = [project.strip() for project in projects if project.strip()]
         self.headers = {'PRIVATE-TOKEN': GITLAB_TOKEN}
 
     def get_repositories(self):
         repos = []
-        for project in self.projects:
-            encoded_project = quote(project, safe='')
-            page = 1
-            while True:
-                url = f"{GITLAB_API_URL}/groups/{encoded_project}/projects"
-                params = {'per_page': 100, 'page': page, 'include_subgroups': True}
-                try:
-                    response = requests.get(url, headers=self.headers, params=params, timeout=10, verify=False)
-                    if response.status_code != 200:
-                        log_error(f"Failed to fetch repos for project {project}: {response.status_code}")
-                        break
-                    data = response.json()
-                    if not data:
-                        break
-                    for repo in data:
-                        if not repo['archived']:
+        url = f"{GITLAB_API_URL}/projects"
+        params = {
+            'pagination': 'keyset',
+            'per_page': 100,
+            'order_by': 'id',
+            'sort': 'asc',
+            'include_subgroups': True
+        }
+
+        while True:
+            try:
+                # Send the request. Use params only for the first request, then use 'url' directly if it's updated.
+                response = requests.get(url, headers=self.headers, params=params if 'params' in locals() else None,
+                                        timeout=40, verify=False)
+                if response.status_code != 200:
+                    log_error(f"Failed to fetch repos for gitlab projects: {response.status_code}")
+                    break
+
+                # Parse the response data
+                data = response.json()
+                if not data:
+                    break
+
+                # Process the repositories
+                for repo in data:
+                    if not repo['archived']:
+                        if any(repo['path_with_namespace'].startswith(allowed_project) for allowed_project in
+                               self.allowed_projects):
                             repos.append({
                                 'name': repo['name'],
                                 'ssh_url_to_repo': repo['ssh_url_to_repo'],
                                 'http_url_to_repo': repo['http_url_to_repo'],
                                 'default_branch': repo['default_branch'],
-                                'name_with_namespace': repo['name_with_namespace']
+                                'name_with_namespace': repo['name_with_namespace'],
+                                'path_with_namespace': repo['path_with_namespace']
                             })
-                    page += 1
-                except requests.exceptions.Timeout:
-                    log_error(f"Request timed out while fetching repositories for project {project}")
-                    break
-                except requests.exceptions.SSLError as ssl_error:
-                    log_error(f"SSL error occurred while fetching repositories for project {project}: {ssl_error}")
-                    break
-                except requests.exceptions.RequestException as e:
-                    log_exception(f"An error occurred while fetching repositories for project {project}: {e}")
-                    break
+                        else:
+                            log_info(
+                                f"Skipping repository {repo['path_with_namespace']} as it does not fall under the allowed projects.")
+
+                # Extract the next page URL from the 'Link' header
+                link_header = response.headers.get('Link', None)
+                if link_header and 'rel="next"' in link_header:
+                    url = self.extract_next_page_url(link_header)
+                    if not url:
+                        log_error("Failed to extract the next page URL, stopping pagination.")
+                        break
+
+                    # From this point on, don't use 'params' anymore
+                    params = None
+                else:
+                    break
+
+            except requests.exceptions.Timeout:
+                log_error(f"Request timed out while fetching repositories for gitlab projects")
+                break
+
+            except requests.exceptions.SSLError as ssl_error:
+                log_error(f"SSL error occurred while fetching repositories for gitlab project: {ssl_error}")
+                break
+
+            except requests.exceptions.RequestException as e:
+                log_exception(f"An error occurred while fetching repositories for gitlab project: {e}")
+                break
 
         return repos
+
+    def extract_next_page_url(self, link_header):
+        """
+        Extracts the URL for the next page from the 'Link' header.
+        """
+        try:
+            # The 'Link' header contains URLs with rel="next", rel="prev", etc.
+            match = re.search(r'<([^>]+)>;\s*rel="next"', link_header)
+            if match:
+                return match.group(1)  # Return the next page URL
+            else:
+                log_error("Next page URL not found in Link header.")
+                return None
+        except Exception as e:
+            log_error(f"Failed to extract next page URL from Link header: {e}")
+            return None
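For reference, keyset pagination ('pagination': 'keyset') means GitLab returns the cursor for the next page only in the Link response header, which is why the loop above follows extract_next_page_url instead of incrementing a page counter. A standalone sketch of that parsing against an illustrative header (the host and id_after value are made up):

import re

# Illustrative Link header in GitLab's keyset-pagination format.
link_header = ('<https://gitlab.example.com/api/v4/projects?pagination=keyset'
               '&per_page=100&order_by=id&sort=asc&id_after=4200>; rel="next"')

match = re.search(r'<([^>]+)>;\s*rel="next"', link_header)
if match:
    # The follow-up request goes straight to this URL with params=None,
    # mirroring how get_repositories() continues its loop.
    print(match.group(1))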

8 changes: 7 additions & 1 deletion repo_cloner.py
Original file line number	Diff line number	Diff line change
@@ -34,10 +34,15 @@ def clone_or_update_repo(self, repo_info, base_dir):
             log_error(f"Failed to update {repo_name}: {e}")
 
     def clone_gitlab_repo(self, repo_info, base_dir):
-        path_parts = repo_info['name_with_namespace'].split(' / ')
+        # Extract the namespace path from the repository info
+        repo_namespace_path = repo_info['path_with_namespace']
+
+        # Proceed with cloning
+        path_parts = repo_namespace_path.split('/')
         repo_name = path_parts[-1]
         repo_path = os.path.join(base_dir, *path_parts[1:-1], repo_name)
         clone_url = repo_info['ssh_url_to_repo'] if CLONE_METHOD == 'ssh' else repo_info['http_url_to_repo']
+
         if not os.path.exists(repo_path):
             os.makedirs(os.path.dirname(repo_path), exist_ok=True)
             try:
@@ -59,3 +64,4 @@ def clone_gitlab_repo(self, repo_info, base_dir):
                 self.clone_gitlab_repo(repo_info, base_dir)
             except GitCommandError as e:
                 log_error(f"Failed to update {repo_name}: {e}")
+
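One side effect worth noting: path_with_namespace carries URL-safe path segments ('group/subgroup/repo') rather than the display names in name_with_namespace ('Group / Subgroup / Repo'), and path_parts[1:-1] still drops the top-level group from the on-disk layout. A quick illustration with a made-up namespace and base directory:

import os

# Made-up values; 'path_with_namespace' is what the GitLab API returns per project.
repo_namespace_path = "platform/tools/repo-sync"
base_dir = "/data/repos"

path_parts = repo_namespace_path.split('/')  # ['platform', 'tools', 'repo-sync']
repo_name = path_parts[-1]

# Same join as clone_gitlab_repo(): the leading group segment is dropped.
repo_path = os.path.join(base_dir, *path_parts[1:-1], repo_name)
print(repo_path)  # /data/repos/tools/repo-sync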
