Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Google Maps filter #473

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions flathunter/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Wrap configuration options as an object"""
import os
from typing import Optional, Dict, Any
from typing import List, Optional, Dict, Any

import json
import yaml
Expand All @@ -18,7 +18,8 @@
from flathunter.crawler.wggesucht import WgGesucht
from flathunter.crawler.vrmimmo import VrmImmo
from flathunter.crawler.subito import Subito
from flathunter.filter import Filter
from flathunter.dataclasses import DistanceConfig
from flathunter.gmaps_duration_processor import TransportationModes
from flathunter.logging import logger
from flathunter.exceptions import ConfigException

Expand Down Expand Up @@ -172,12 +173,6 @@ def searchers(self):
"""Get the list of search plugins"""
return self.__searchers__

def get_filter(self):
"""Read the configured filter"""
builder = Filter.builder()
builder.read_config(self)
return builder.build()

def captcha_enabled(self):
"""Check if captcha is configured"""
return self._get_captcha_solver() is not None
Expand Down Expand Up @@ -354,6 +349,23 @@ def max_price_per_square(self):
"""Return the configured maximum price per square meter"""
return self._get_filter_config("max_price_per_square")

def max_distance(self) -> List[DistanceConfig]:
    """Return the configured maximum-distance filters, one per location.

    Reads the ``max_distance`` entry of the filter configuration. Every item
    must provide ``location_name`` and ``transportation_mode``; the limits
    ``max_distance_meters`` and ``max_duration_seconds`` are optional and are
    passed on as None when absent.

    Returns:
        A list of DistanceConfig objects. The list is empty when nothing is
        configured, so callers can iterate over the result without a
        separate None check.

    Raises:
        KeyError: if an item lacks ``location_name`` or ``transportation_mode``.
        ValueError: if ``transportation_mode`` is not a TransportationModes value.
    """
    config = self._get_filter_config("max_distance")
    if not config:
        return []
    return [
        DistanceConfig(
            location_name=item['location_name'],
            transport_mode=TransportationModes(item['transportation_mode']),
            max_distance_meters=item.get('max_distance_meters'),
            max_duration_seconds=item.get('max_duration_seconds'),
        )
        for item in config
    ]

def __repr__(self):
return json.dumps({
"captcha_enabled": self.captcha_enabled(),
Expand Down
63 changes: 63 additions & 0 deletions flathunter/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class TransportationModes(Enum):
    """The transportation mode for Google Maps distance calculation.

    Members can be looked up from their string value, for example
    ``TransportationModes('transit')``.
    """
    TRANSIT = 'transit'
    BICYCLING = 'bicycling'
    DRIVING = 'driving'
    WALKING = 'walking'


@dataclass
class DistanceValueTuple:
    """We want to keep both the numeric value of a distance, and its string representation."""
    # Numeric value of the distance, in meters.
    meters: float
    # String representation of the same distance.
    text: str


@dataclass
class DurationValueTuple:
    """We want to keep both the numeric value of a duration, and its string representation."""
    # Numeric value of the duration, in seconds.
    seconds: float
    # String representation of the same duration.
    text: str


@dataclass
class DistanceElement:
    """Represents the distance from a property to some location."""
    # Duration to the location (numeric seconds plus its text form).
    duration: DurationValueTuple
    # Distance to the location (numeric meters plus its text form).
    distance: DistanceValueTuple
    # Transportation mode this measurement was made for.
    mode: TransportationModes


@dataclass
class DistanceConfig:
    """Represents distance filter information in the configuration file.

    location_name must refer to the location name used to identify the location
    in the durations section of the config file, and the transport_mode must be
    configured in the durations section for that location name; otherwise no
    information is available to actually filter on.

    Both limits are optional. A limit that is None is not applied.
    """
    # Name of the location, as used in the durations section of the config.
    location_name: str
    # Transportation mode the limits refer to.
    transport_mode: TransportationModes
    # Upper bound on the distance in meters, or None for no distance limit.
    max_distance_meters: Optional[float] = None
    # Upper bound on the duration in seconds, or None for no duration limit.
    max_duration_seconds: Optional[float] = None


class FilterChainName(Enum):
    """Identifies the filter chain that a filter acts on

    Preprocess filters will be run before the expose is processed by any further actions.
    Use this chain to filter exposes that can be excluded based on information scraped
    from the expose website alone (such as based on price or size).
    Postprocess filters will be run after other actions have completed. Use this if you
    require additional information from other steps, such as information from the Google
    Maps API, to make a decision on this expose.

    We separate the filter chains to avoid making expensive (literally!) calls to the
    Google Maps API for exposes that we already know we aren't interested in anyway."""
    # Chain that runs before the expose is processed by any further actions.
    preprocess = 'PREPROCESS'
    # Chain that runs after other actions have completed.
    postprocess = 'POSTPROCESS'
88 changes: 65 additions & 23 deletions flathunter/filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
"""Module with implementations of standard expose filters"""
from functools import reduce
import re
from abc import ABC, ABCMeta
from typing import List, Any
from typing import List, Any, Dict

from flathunter.config import DistanceConfig
from flathunter.dataclasses import FilterChainName
from flathunter.gmaps_duration_processor import DistanceElement
from flathunter.logging import logger


class AbstractFilter(ABC):
Expand Down Expand Up @@ -172,30 +176,65 @@ def is_interesting(self, expose):
return pps <= self.max_pps


class FilterBuilder:
class DistanceFilter(AbstractFilter):
    """Exclude properties based on distance or duration to a location

    This must be in the post-processing filter chain, as it requires data
    from the Google Maps API, which is not available right after scraping."""

    distance_config: DistanceConfig

    def __init__(self, distance_config: DistanceConfig):
        """Store the limits for the single location this filter checks."""
        self.distance_config = distance_config

    def is_interesting(self, expose):
        """Return True unless the expose exceeds a configured limit.

        The measurement is read from ``expose['durations_unformatted']`` under
        the configured location name. When no measurement is available for
        that location the filter is skipped and the expose is kept.

        Limits are inclusive (a value equal to the maximum passes), matching
        the other maximum filters in this module. A limit that is None is not
        applied; a limit of 0 is applied.
        """
        # NOTE(review): distance_config.transport_mode is not consulted here;
        # the measurement is looked up by location name only.
        location_name = self.distance_config.location_name
        durations: Dict[str, DistanceElement] = expose.get('durations_unformatted') or {}
        element = durations.get(location_name)
        if element is None:
            logger.info(
                'DistanceFilter is enabled, but no GMaps data found for %s. Skipping filter.',
                location_name)
            return True

        max_distance = self.distance_config.max_distance_meters
        if max_distance is not None and element.distance.meters > max_distance:
            return False

        max_duration = self.distance_config.max_duration_seconds
        if max_duration is not None and element.duration.seconds > max_duration:
            return False

        return True


class FilterChainBuilder:
"""Construct a filter chain"""
filters: List[AbstractFilter]

def __init__(self):
self.filters = []

def _append_filter_if_not_empty(self, filter_class: ABCMeta, filter_config: Any):
def _append_filter_if_not_empty(
self,
filter_class: ABCMeta,
filter_config: Any):
"""Appends a filter to the list if its configuration is set"""
if not filter_config:
return
self.filters.append(filter_class(filter_config))

def read_config(self, config):
def read_config(self, config, filter_chain: FilterChainName):
"""Adds filters from a config dictionary"""
self._append_filter_if_not_empty(TitleFilter, config.excluded_titles())
self._append_filter_if_not_empty(MinPriceFilter, config.min_price())
self._append_filter_if_not_empty(MaxPriceFilter, config.max_price())
self._append_filter_if_not_empty(MinSizeFilter, config.min_size())
self._append_filter_if_not_empty(MaxSizeFilter, config.max_size())
self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms())
self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms())
self._append_filter_if_not_empty(
PPSFilter, config.max_price_per_square())
if filter_chain == FilterChainName.preprocess:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit "meh" on if-else chains. Do you think we could do this with Enum and Match?

https://realpython.com/python-enum/#using-enumerations-in-if-and-match-statements

self._append_filter_if_not_empty(TitleFilter, config.excluded_titles())
self._append_filter_if_not_empty(MinPriceFilter, config.min_price())
self._append_filter_if_not_empty(MaxPriceFilter, config.max_price())
self._append_filter_if_not_empty(MinSizeFilter, config.min_size())
self._append_filter_if_not_empty(MaxSizeFilter, config.max_size())
self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms())
self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms())
self._append_filter_if_not_empty(
PPSFilter, config.max_price_per_square())
elif filter_chain == FilterChainName.postprocess:
for df in config.max_distance():
self._append_filter_if_not_empty(DistanceFilter, df)
else:
raise NotImplementedError()
return self

def filter_already_seen(self, id_watch):
Expand All @@ -204,12 +243,12 @@ def filter_already_seen(self, id_watch):
return self

def build(self):
"""Return the compiled filter"""
return Filter(self.filters)
"""Return the compiled filter chain"""
return FilterChain(self.filters)


class Filter:
"""Abstract filter object"""
class FilterChain:
"""Collection of expose filters in use by a hunter instance"""

filters: List[AbstractFilter]

Expand All @@ -218,14 +257,17 @@ def __init__(self, filters: List[AbstractFilter]):

def is_interesting_expose(self, expose):
"""Apply all filters to this expose"""
return reduce((lambda x, y: x and y),
map((lambda x: x.is_interesting(expose)), self.filters), True)

for filter_ in self.filters:
if not filter_.is_interesting(expose):
return False
return True

def filter(self, exposes):
"""Apply all filters to every expose in the list"""
return filter(self.is_interesting_expose, exposes)

@staticmethod
def builder():
"""Return a new filter builder"""
return FilterBuilder()
"""Return a new filter chain builder"""
return FilterChainBuilder()

66 changes: 42 additions & 24 deletions flathunter/gmaps_duration_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,55 @@
import datetime
import time
from urllib.parse import quote_plus
from typing import Dict
import requests
from flathunter.dataclasses import DistanceElement, DistanceValueTuple, DurationValueTuple, TransportationModes

from flathunter.logging import logger
from flathunter.abstract_processor import Processor


class GMapsDurationProcessor(Processor):
"""Implementation of Processor class to calculate travel durations"""

GM_MODE_TRANSIT = 'transit'
GM_MODE_BICYCLE = 'bicycling'
GM_MODE_DRIVING = 'driving'

def __init__(self, config):
self.config = config

def process_expose(self, expose):
"""Calculate the durations for an expose"""
expose['durations'] = self.get_formatted_durations(expose['address']).strip()
durations = self.get_distances_and_durations(expose['address'])
expose['durations'] = self._format_durations(durations).strip()
expose['durations_unformatted'] = durations
return expose

def get_distances_and_durations(self, address) -> "Dict[str, DistanceElement]":
    """Return a dict mapping location names to distances and durations

    Iterates over the ``durations`` entries of the configuration and queries
    Google Maps for every mode that has both ``gm_id`` and ``title`` set.
    Entries lacking ``destination``, ``name`` or ``modes`` are logged and
    skipped. Nothing is queried when no ``key`` is configured under
    ``google_maps_api``.

    Lookups that yield no result are left out of the mapping, so every value
    in the returned dict is a usable element. The dict is keyed by location
    name only: when several modes of one location yield a result, the last
    one wins.
    """
    # Whether an API key is configured does not change between iterations.
    has_api_key = 'key' in self.config.get('google_maps_api', {})
    out = {}
    for duration_config in self.config.get('durations', []):
        if not all(key in duration_config for key in ('destination', 'name', 'modes')):
            logger.warning('illegal duration configuration: %s', duration_config)
            continue
        dest = duration_config['destination']
        name = duration_config['name']
        for mode in duration_config['modes']:
            if not has_api_key or 'gm_id' not in mode or 'title' not in mode:
                continue
            element = self._get_gmaps_distance(address, dest, mode['gm_id'])
            if element is None:
                # A failed lookup must neither be stored nor overwrite an
                # earlier successful result for the same location.
                continue
            out[name] = element
    return out

def get_formatted_durations(self, address):
"""Return a formatted list of GoogleMaps durations"""
out = ""
for duration in self.config.get('durations', []):
if 'destination' in duration and 'name' in duration:
dest = duration.get('destination')
name = duration.get('name')
for mode in duration.get('modes', []):
if 'gm_id' in mode and 'title' in mode \
and 'key' in self.config.get('google_maps_api', {}):
duration = self.get_gmaps_distance(address, dest, mode['gm_id'])
title = mode['title']
out += f"> {name} ({title}): {duration}\n"
durations = self.get_distances_and_durations(address)
return self._format_durations(durations)

def _format_durations(self, durations: Dict[str, DistanceElement]):
    """Render one "> name (mode): duration (distance)" line per location."""
    lines = [
        f"> {location_name} ({val.mode.value}): {val.duration.text} ({val.distance.text})\n"
        for location_name, val in durations.items()
    ]
    # Joining and stripping once drops the trailing newline of the last line.
    return "".join(lines).strip()

def get_gmaps_distance(self, address, dest, mode):
def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement | None:
"""Get the distance"""
# get timestamp for next monday at 9:00:00 o'clock
now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
Expand All @@ -54,11 +66,10 @@ def get_gmaps_distance(self, address, dest, mode):
base_url = self.config.get('google_maps_api', {}).get('url')
gm_key = self.config.get('google_maps_api', {}).get('key')

if not gm_key and mode != self.GM_MODE_DRIVING:
if not gm_key and mode != TransportationModes.DRIVING:
logger.warning("No Google Maps API key configured and without using a mode "
"different from 'driving' is not allowed. "
"Downgrading to mode 'drinving' thus. ")
mode = 'driving'
"different from 'driving' is not allowed. Thus downgrading to mode 'driving'.")
mode = TransportationModes.DRIVING
base_url = base_url.replace('&key={key}', '')

# retrieve the result
Expand All @@ -82,7 +93,14 @@ def get_gmaps_distance(self, address, dest, mode):
element['distance']['text'],
element['duration']['text'],
element['duration']['value'])
duration_text = element['duration']['text']
distance_text = element['distance']['text']
distances[element['duration']['value']] = f"{duration_text} ({distance_text})"
distance_element = DistanceElement(
duration=DurationValueTuple(
float(element['duration']['value']),
element['duration']['text']),
distance=DistanceValueTuple(
float(element['distance']['value']),
element['distance']['text']),
mode=TransportationModes(mode)
)
distances[distance_element.distance.meters] = distance_element
return distances[min(distances.keys())] if distances else None
Loading