Fix get_jobs method in Comparison Monitor #444

Merged: 3 commits, May 6, 2024

Changes from all commits
spidermon/contrib/scrapy/monitors/monitors.py (30 changes: 17 additions & 13 deletions)

@@ -557,26 +557,30 @@ def run(self, result):
     def _get_jobs(self, states, number_of_jobs):
         tags = self._get_tags_to_filter()
 
-        jobs = []
+        total_jobs = []
         start = 0
         client = Client(self.crawler.settings)
+        MAX_API_COUNT = 1000
 
-        _jobs = client.spider.jobs.list(
-            start=start,
-            state=states,
-            count=number_of_jobs,
-            filters=dict(has_tag=tags) if tags else None,
-        )
-        while _jobs:
-            jobs.extend(_jobs)
-            start += 1000
-            _jobs = client.spider.jobs.list(
+        while True:
+            # API has a 1000 results limit
+            count = min(number_of_jobs - len(total_jobs), MAX_API_COUNT)
+            current_jobs = client.spider.jobs.list(
                 start=start,
                 state=states,
-                count=number_of_jobs,
+                count=count,
                 filters=dict(has_tag=tags) if tags else None,
             )
-        return jobs
+            total_jobs.extend(current_jobs)
+
+            if len(current_jobs) < MAX_API_COUNT or len(total_jobs) == number_of_jobs:
+                # Stop paginating if results are less than 1000 (pagination not required)
+                # or target jobs was reached - no more pagination required
+                break
+
+            start += len(current_jobs)
+
+        return total_jobs
 
     def _get_tags_to_filter(self):
         """

Review thread on the filters=dict(has_tag=tags) if tags else None line:

@curita (Member) commented on May 6, 2024:

issue (non-blocking): This was already like this, so we might need to update it in another PR, but filters=dict(has_tag=tags) if tags else None doesn't do anything, at least in the version of scrapinghub I'm using (2.4.0). has_tag=tags as a parameter by itself, without the filters wrap, does filter the jobs, though I'm not sure if that's backward compatible.

The PR author (Collaborator) replied:

Are there any other issues about this?

@curita (Member) replied:

Not that I know of, no! And that issue is not related to these changes. We should be good to merge 👍
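Based on that thread, a follow-up change (outside this PR) could drop the ignored filters wrapper and pass has_tag directly. A minimal sketch of what the call site in the patched _get_jobs above might become, assuming scrapinghub 2.4.0's jobs.list accepts has_tag as a keyword, as curita reports (client, start, states, count, and tags are the names from the diff above; this is an illustration, not part of the merged PR):

            # Hypothetical follow-up, not part of this PR: has_tag filters jobs,
            # while filters=dict(has_tag=tags) is ignored by scrapinghub 2.4.0.
            current_jobs = client.spider.jobs.list(
                start=start,
                state=states,
                count=count,
                **({"has_tag": tags} if tags else {}),  # only send the kwarg when tags exist
            )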
tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py (77 changes: 64 additions & 13 deletions)

@@ -1,4 +1,5 @@
-from unittest.mock import Mock, call
+import math
+from unittest.mock import Mock, call, patch
 
 import pytest
 from spidermon import MonitorSuite
@@ -10,6 +11,7 @@
     ZyteJobsComparisonMonitor,
     monitors,
 )
+from spidermon.core.factories import MonitorFactory
 from spidermon.exceptions import NotConfigured
 
 
@@ -24,21 +26,16 @@ def mock_suite(mock_jobs, monkeypatch):
     return MonitorSuite(monitors=[ZyteJobsComparisonMonitor])
 
 
+def get_paginated_jobs(**kwargs):
+    return [Mock() for _ in range(kwargs["count"])]
+
+
 @pytest.fixture
 def mock_suite_and_zyte_client(
     monkeypatch,
     number_of_jobs,
 ):
-    def get_paginated_jobs(**kwargs):
-        start = kwargs["start"]
-        if start < number_of_jobs:
-            return [
-                Mock() for _ in range(start, max(number_of_jobs - 1000, number_of_jobs))
-            ]
-        return []
-
     monkeypatch.setenv("SHUB_JOB_DATA", '{"tags":["tag1","tag2","tag3"]}')
 
     mock_client = Mock()
     mock_client.spider.jobs.list.side_effect = get_paginated_jobs
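Moving get_paginated_jobs to module scope lets both this fixture and the new test below reuse it. As a quick illustration of its behavior (a standalone sketch, not part of the diff): it simulates an API that always returns a full page of exactly the requested size.

from unittest.mock import Mock

def get_paginated_jobs(**kwargs):
    # Simulates the jobs API returning a full page of `count` jobs.
    return [Mock() for _ in range(kwargs["count"])]

# Each call yields exactly as many mock jobs as requested:
assert len(get_paginated_jobs(start=0, count=1000)) == 1000
assert len(get_paginated_jobs(start=2000, count=500)) == 500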

@@ -88,6 +85,58 @@ def test_jobs_comparison_monitor_is_enabled(
     runner.run(mock_suite, **data)
 
 
+class TestZyteJobsComparisonMonitor(ZyteJobsComparisonMonitor):
+    def runTest():
+        pass
+
+
+def test_jobs_comparison_monitor_get_jobs():
+    mock_client = Mock()
+    with patch(
+        "spidermon.contrib.scrapy.monitors.monitors.Client"
+    ) as mock_client_class:
+        mock_client_class.return_value = mock_client
+        monitor = TestZyteJobsComparisonMonitor()
+        monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
+        monitor.data = Mock()
+        mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)
+
+        # Return exact number of jobs
+        jobs = monitor._get_jobs(states=None, number_of_jobs=50)
+        assert len(jobs) == 50
+        mock_client.spider.jobs.list.assert_called_once()
+
+    mock_client = Mock()
+    with patch(
+        "spidermon.contrib.scrapy.monitors.monitors.Client"
+    ) as mock_client_class:
+        mock_client_class.return_value = mock_client
+        monitor = TestZyteJobsComparisonMonitor()
+        monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
+        monitor.data = Mock()
+        output = [Mock(), Mock()]
+        mock_client.spider.jobs.list = Mock(return_value=output)
+
+        # Return less jobs than expected
+        jobs = monitor._get_jobs(states=None, number_of_jobs=50)
+        assert jobs == output
+        mock_client.spider.jobs.list.assert_called_once()
+
+    with patch(
+        "spidermon.contrib.scrapy.monitors.monitors.Client"
+    ) as mock_client_class:
+        mock_client_class.return_value = mock_client
+        monitor = TestZyteJobsComparisonMonitor()
+        monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
+        monitor.data = Mock()
+        mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)
+
+        # Jobs bigger than 1000
+        jobs = monitor._get_jobs(states=None, number_of_jobs=2500)
+        assert len(jobs) == 2500
+        assert mock_client.spider.jobs.list.call_count == 3
+
+
 @pytest.mark.parametrize(
     ["item_count", "previous_counts", "threshold", "should_raise"],
     [
@@ -144,12 +193,14 @@ def test_arguments_passed_to_zyte_client(

     calls = [
         call.mock_client.spider.jobs.list(
-            start=n,
+            start=n * 1000,
             state=list(states),
-            count=number_of_jobs,
+            # Count goes from pending number of jobs up to 1000
+            count=min(number_of_jobs - n * 1000, 1000),
             filters={"has_tag": list(tags)},
         )
-        for n in range(0, number_of_jobs + 1000, 1000)
+        # One call to api every 1000 expected jobs
+        for n in range(0, math.ceil(number_of_jobs / 1000))
     ]
 
     mock_client.spider.jobs.list.assert_has_calls(calls)
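As a quick sanity check on the arithmetic in these expected calls (a standalone sketch, not part of the PR): for 2500 requested jobs the paginated loop should issue three requests, with starts 0, 1000, 2000 and counts 1000, 1000, 500.

import math

# Reproduce the expected (start, count) pairs for a 2500-job request,
# mirroring the comprehension in the test above.
number_of_jobs = 2500
pages = [
    (n * 1000, min(number_of_jobs - n * 1000, 1000))
    for n in range(0, math.ceil(number_of_jobs / 1000))
]
assert pages == [(0, 1000), (1000, 1000), (2000, 500)]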
tox.ini (2 changes: 1 addition & 1 deletion)

@@ -4,7 +4,7 @@ skip_missing_interpreters = True

 [testenv]
 deps =
-    pytest
+    pytest==8.1.1
     pytest-cov
     pytest-mock
 extras =