Merge branch 'development'
ceroberoz committed Sep 14, 2024
2 parents d04efb3 + 921ef60 · commit 0c4200a
Showing 6 changed files with 284 additions and 112 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -8,11 +8,11 @@

## 🆕 Latest Updates

- Added Mekari job listings
- Implemented Playwright for improved JavaScript rendering and pagination handling
- Enhanced Kredivo spider with better data extraction
- Enhanced Karir spider with better data extraction
- Improved error handling and logging
- Optimized pagination logic for better performance
- Updated data processing pipeline for increased efficiency

## 📊 Overview

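
The README update above calls out the Playwright integration for JavaScript-heavy pages. That wiring is not part of this diff, so the following is only a minimal sketch of how scrapy-playwright is typically hooked into a Scrapy project; the handler and reactor settings are the library's standard ones, while the example spider, its name, and the URL are assumptions, not code from this repository.

# Hypothetical sketch: typical scrapy-playwright wiring, not taken from this commit.
# In settings.py:
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# In a spider, a request is routed through Playwright via request meta:
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"  # hypothetical spider name
    start_urls = ["https://example.com/jobs"]  # hypothetical URL

    def start_requests(self):
        for url in self.start_urls:
            # "playwright": True asks scrapy-playwright to render the page in a real browser
            yield scrapy.Request(url, meta={"playwright": True}, callback=self.parse)

    def parse(self, response):
        # The response body now contains the JavaScript-rendered HTML
        yield {"title": response.css("title::text").get()}
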
1 change: 1 addition & 0 deletions freya/settings.py
@@ -26,6 +26,7 @@
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
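
The new DOWNLOAD_DELAY = 2 makes Scrapy wait roughly two seconds between requests to the same domain (by default the actual delay is randomized between 0.5x and 1.5x of this value). As a reference, here is a sketch of the standard Scrapy throttling options that are often tuned alongside it; only DOWNLOAD_DELAY itself is set by this commit.

# Standard Scrapy throttling options (sketch; only DOWNLOAD_DELAY is part of this commit)
DOWNLOAD_DELAY = 2                 # base delay, in seconds, between requests to one domain
RANDOMIZE_DOWNLOAD_DELAY = True    # default: actual wait is 0.5x-1.5x of DOWNLOAD_DELAY
AUTOTHROTTLE_ENABLED = True        # adjust the delay dynamically based on server latency
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 10
CONCURRENT_REQUESTS_PER_DOMAIN = 8
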
96 changes: 59 additions & 37 deletions freya/spiders/karir.py
@@ -7,6 +7,7 @@
from freya.pipelines import calculate_job_age
from freya.utils import calculate_job_apply_end_date

# Set up the logger for this spider
logger = logging.getLogger(__name__)

class KarirSpiderJson(scrapy.Spider):
@@ -23,10 +24,12 @@ class KarirSpiderJson(scrapy.Spider):
]

def __init__(self, *args, **kwargs):
# Initialize the spider and set the current timestamp
super().__init__(*args, **kwargs)
self.timestamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def start_requests(self):
# Start the scraping process by sending the initial request
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'en-US,en;q=0.9',
@@ -39,7 +42,7 @@ def start_requests(self):
'dnt': '1',
'sec-gpc': '1'
}

payload = self.get_payload(0) # Start with offset 0

yield scrapy.Request(
@@ -51,6 +54,7 @@
)

def get_payload(self, offset):
# Create the payload for the API request
return {
"keyword":"*",
"location_ids":[],
@@ -80,14 +84,24 @@ def parse(self, response):
total_opportunities = data['data']['total_opportunities']

for opportunity in opportunities:
yield self.parse_job(opportunity)
yield scrapy.Request(
f"https://karir.com/_next/data/3t_6puNZeaT81JcSVqzwu/opportunities/{opportunity['id']}.json?index={opportunity['id']}",
headers={
'User-Agent': random.choice(self.USER_AGENTS),
'Accept': '*/*',
'Referer': 'https://karir.com/search-lowongan?keyword=*'
},
callback=self.parse_job_details,
meta={'job': opportunity}
)

# Explosive pagination!
# Handle pagination
current_offset = json.loads(response.request.body)['offset']
next_offset = current_offset + self.LIMIT

if next_offset < total_opportunities:
logger.info(f"💥 EXPLOSION! Fetching next page. Current progress: {next_offset}/{total_opportunities} 💥")
# Log progress: show how many jobs have been processed out of the total
logger.info(f"Progress update: Processed {next_offset} out of {total_opportunities} jobs")
yield scrapy.Request(
self.BASE_URL,
method='POST',
Expand All @@ -96,51 +110,58 @@ def parse(self, response):
callback=self.parse
)
else:
logger.info("🎆 ULTIMATE EXPLOSION! All job opportunities have been obliterated... I mean, scraped! 🎆")
# Log completion: all jobs have been processed
logger.info(f"Finished processing all {total_opportunities} jobs")

except json.JSONDecodeError as e:
logger.error(f"💔 Error decoding JSON: {e}")
logger.debug(f"Response content: {response.text}")
# Log error: couldn't understand the JSON response
logger.error(f"Couldn't read the JSON response: {e}")
logger.debug(f"The response we couldn't understand: {response.text}")
except Exception as e:
logger.error(f"💥 Unexpected error: {e}")
# Log error: something unexpected went wrong
logger.error(f"An unexpected problem occurred: {e}")

def parse_job(self, job: Dict[str, Any]) -> Dict[str, Any]:
first_seen = datetime.strptime(self.timestamp, "%d/%m/%Y %H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")
last_seen = self.format_datetime(job['posted_at'])

return {
'job_title': self.sanitize_string(job['job_position']),
'job_location': self.sanitize_string(job['description']),
'job_department': 'N/A', # Not provided in the response
'job_url': f"https://karir.com/opportunities/{job['id']}", # Assuming this is the correct URL format
'first_seen': first_seen,
'base_salary': self.get_salary_info(job),
'job_type': 'N/A', # Not provided in the response
'job_level': 'N/A', # Not provided in the response
'job_apply_end_date': calculate_job_apply_end_date(last_seen),
'last_seen': last_seen,
'is_active': 'True',
'company': self.sanitize_string(job['company_name']),
'company_url': f"https://karir.com/companies/{job['company_id']}", # Assuming this is the correct URL format
'job_board': 'Karir.com',
'job_board_url': 'https://karir.com/',
'job_age': calculate_job_age(first_seen, last_seen),
'work_arrangement': 'N/A', # Not provided in the response

# Optional fields
# 'is_urgent': str(job['is_urgent']),
}
def parse_job_details(self, response):
try:
job = response.meta['job']
job_detail = json.loads(response.text)['pageProps']['responseData']

first_seen = self.timestamp
last_seen = self.format_datetime(job['posted_at'])

yield {
'job_title': self.sanitize_string(job['job_position']),
'job_location': self.sanitize_string(job_detail['location']),
'job_department': ' - '.join(job_detail['job_functions']),
'job_url': f"https://karir.com/opportunities/{job['id']}",
'first_seen': first_seen,
'base_salary': self.get_salary_info(job_detail),
'job_type': job_detail['job_type'],
'job_level': ' - '.join(job_detail['job_levels']),
'job_apply_end_date': self.format_datetime(job_detail['expires_at']),
'last_seen': last_seen,
'is_active': str(not job_detail['is_expired']),
'company': self.sanitize_string(job_detail['company_name']),
'company_url': f"https://karir.com/companies/{job_detail['company']['id']}",
'job_board': 'Karir.com',
'job_board_url': 'https://karir.com/',
'job_age': calculate_job_age(first_seen, last_seen),
'work_arrangement': job_detail['workplace'],
}
except Exception as e:
logger.error(f"Error processing job details: {e}")

@staticmethod
def sanitize_string(s: Optional[str]) -> str:
# Clean up a string by removing unwanted characters
if s is None:
return 'N/A'
# Strip whitespace, replace commas with hyphens, and handle empty strings
sanitized = s.strip().replace(',', ' -')
return sanitized if sanitized else 'N/A'

@staticmethod
def format_datetime(date_string: str) -> str:
# Convert a date string to a standard format
try:
dt = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
return dt.strftime("%Y-%m-%d %H:%M:%S")
@@ -153,9 +174,10 @@ def format_datetime(date_string: str) -> str:

@staticmethod
def get_salary_info(job: Dict[str, Any]) -> str:
# Extract salary information from the job data
if job['salary_lower'] and job['salary_upper']:
return f"{job['salary_lower']} - {job['salary_upper']}"
elif job['salary_info'] and job['salary_info'] != 'LABEL_COMPETITIVE_SALARY':
return job['salary_info']
else:
return 'N/A'
return 'N/A'
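
For reference, a minimal sketch of running the updated spider programmatically with Scrapy's CrawlerProcess; the usual `scrapy crawl <name>` CLI works as well, but the spider's registered name is not visible in this diff, so it is not assumed here.

# Sketch: run the Karir spider programmatically (assumes execution from the project root)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from freya.spiders.karir import KarirSpiderJson

process = CrawlerProcess(get_project_settings())  # picks up freya/settings.py, incl. DOWNLOAD_DELAY
process.crawl(KarirSpiderJson)
process.start()  # blocks until the crawl finishes
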
2 changes: 1 addition & 1 deletion pipeline/scrape.sh
@@ -75,4 +75,4 @@ if $found_files; then
echo "Merged CSV file updated: $MERGED_FILE"
else
echo "No output files found to merge."
fi
fi
4 changes: 4 additions & 0 deletions pipeline/sheet_updater.py
@@ -6,10 +6,14 @@
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from dotenv import load_dotenv

import sys
from pathlib import Path

# Load environment variables from .env file
load_dotenv()

# Add the project root directory to the Python path
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
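
The sheet_updater change loads configuration from a .env file via python-dotenv. A minimal sketch of that pattern follows; the variable names are hypothetical, since the keys actually read by sheet_updater.py are not shown in this diff.

# Sketch of the python-dotenv pattern; SPREADSHEET_ID and GOOGLE_CREDENTIALS_FILE are hypothetical names
import os
from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from a .env file in the working directory

spreadsheet_id = os.getenv("SPREADSHEET_ID")
credentials_file = os.getenv("GOOGLE_CREDENTIALS_FILE", "service-account.json")

if spreadsheet_id is None:
    raise RuntimeError("SPREADSHEET_ID is not set; add it to .env or the environment")
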
