[kitsune] Fix resuming feature #80

Merged · 1 commit · Nov 12, 2024
130 changes: 53 additions & 77 deletions perceval/backends/mozilla/kitsune.py
@@ -26,19 +26,21 @@

import requests

from grimoirelab_toolkit.datetime import str_to_datetime
from grimoirelab_toolkit.datetime import str_to_datetime, datetime_to_utc
from grimoirelab_toolkit.uris import urijoin

from ...backend import (Backend,
BackendCommand,
BackendCommandArgumentParser)
from ...client import HttpClient
from ...errors import ParseError, RateLimitError, HttpClientError
from ...utils import DEFAULT_DATETIME


logger = logging.getLogger(__name__)


KITSUNE_URL = "https://support.mozilla.org"
DEFAULT_OFFSET = 0

CATEGORY_QUESTION = "question"

@@ -61,12 +63,13 @@ class Kitsune(Backend):
:param archive: archive to store/retrieve items
:param ssl_verify: enable/disable SSL verification
"""
version = '1.0.0'
version = '2.0.0'

CATEGORIES = [CATEGORY_QUESTION]

def __init__(self, url=None, tag=None, archive=None, ssl_verify=True, sleep_for_rate=False,
sleep_time=DEFAULT_SLEEP_TIME, max_retries=MAX_RETRIES):
def __init__(self, url=None, tag=None, archive=None, ssl_verify=True,
sleep_for_rate=False, sleep_time=DEFAULT_SLEEP_TIME,
max_retries=MAX_RETRIES):
if not url:
url = KITSUNE_URL
origin = url
@@ -79,17 +82,17 @@ def __init__(self, url=None, tag=None, archive=None, ssl_verify=True, sleep_for_

self.client = None

def fetch(self, category=CATEGORY_QUESTION, offset=DEFAULT_OFFSET):
def fetch(self, category=CATEGORY_QUESTION, from_date=DEFAULT_DATETIME):
"""Fetch questions from the Kitsune url.

:param category: the category of items to fetch
:offset: obtain questions after offset
:param from_date: obtain questions updated since this date
:returns: a generator of questions
"""
if not offset:
offset = DEFAULT_OFFSET
if not from_date:
from_date = DEFAULT_DATETIME

kwargs = {"offset": offset}
kwargs = {"from_date": from_date}
items = super().fetch(category, **kwargs)

return items
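
With this change, incremental fetching keys on the question's update time rather than a positional offset. A minimal usage sketch of the new interface (the backend path and item layout follow the code above; the date value is illustrative):

```python
from datetime import datetime

from perceval.backends.mozilla.kitsune import Kitsune

# Fetch only questions updated after the given date; naive datetimes
# are normalized to UTC by the client (see datetime_to_utc above).
backend = Kitsune(url="https://support.mozilla.org")
for item in backend.fetch(from_date=datetime(2024, 1, 1)):
    print(item['data']['id'], item['updated_on'])
```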
@@ -102,87 +105,37 @@ def fetch_items(self, category, **kwargs):

:returns: a generator of items
"""
offset = kwargs['offset']
from_date = kwargs['from_date']

logger.info("Looking for questions at url '%s' using offset %s",
self.url, str(offset))
logger.info("Looking for questions at url '%s' updated after %s",
self.url, str(from_date))

nquestions = 0 # number of questions processed
tquestions = 0 # number of questions from API data
equestions = 0 # number of questions dropped by errors

# Always get complete pages so the first item is always
# the first one in the page
page = int(offset / KitsuneClient.ITEMS_PER_PAGE)
page_offset = page * KitsuneClient.ITEMS_PER_PAGE
# drop questions from page before the offset
drop_questions = offset - page_offset
current_offset = offset

questions_page = self.client.get_questions(offset)

while True:
try:
raw_questions = next(questions_page)
except StopIteration:
break
except requests.exceptions.HTTPError as e:
# Continue with the next page if it is a 500 error
if e.response.status_code == 500:
logger.exception(e)
logger.error("Problem getting Kitsune questions. "
"Loosing %i questions. Going to the next page.",
KitsuneClient.ITEMS_PER_PAGE)
equestions += KitsuneClient.ITEMS_PER_PAGE
current_offset += KitsuneClient.ITEMS_PER_PAGE
questions_page = self.client.get_questions(current_offset)
continue
else:
# If it is another error just propagate the exception
raise e

for questions_page in self.client.get_questions(from_date):
try:
questions_data = json.loads(raw_questions)
questions_data = json.loads(questions_page)
tquestions = questions_data['count']
questions = questions_data['results']
except (ValueError, KeyError) as ex:
logger.error(ex)
cause = ("Bad JSON format for mozilla_questions: %s" % (raw_questions))
cause = "Bad JSON format for mozilla_questions: %s" % (questions_page)
raise ParseError(cause=cause)

for question in questions:
if drop_questions > 0:
# Remove extra questions due to page base retrieval
drop_questions -= 1
continue
question['offset'] = current_offset
current_offset += 1
question['answers_data'] = []
for raw_answers in self.client.get_question_answers(question['id']):
answers = json.loads(raw_answers)['results']
question['answers_data'] += answers
yield question
nquestions += 1

logger.debug("Questions: %i/%i", nquestions + offset, tquestions)
equestions = tquestions - nquestions

logger.info("Total number of questions: %i (%i total)", nquestions, tquestions)
logger.info("Questions with errors dropped: %i", equestions)

def metadata(self, item, filter_classified=False):
"""Kitsune metadata.

This method takes items overrides `metadata` method to add extra
information related to Kitsune (offset of the question).

:param item: an item fetched by a backend
:param filter_classified: sets if classified fields were filtered
"""
item = super().metadata(item, filter_classified=filter_classified)
item['offset'] = item['data'].pop('offset')

return item

@classmethod
def has_archiving(cls):
"""Returns whether it supports archiving items on the fetch process.
@@ -209,15 +162,21 @@ def metadata_id(item):
def metadata_updated_on(item):
"""Extracts the update time from a Kitsune item.

The timestamp is extracted from 'updated' field.
This date is a UNIX timestamp but needs to be converted to
a float value.
The timestamp is the maximum 'updated' field from the question
and the answers. This date is a UNIX timestamp but needs to be
converted to a float value.

:param item: item generated by the backend

:returns: a UNIX timestamp
"""
return float(str_to_datetime(item['updated']).timestamp())
max_updated_on = float(str_to_datetime(item['updated']).timestamp())

for answer in item['answers_data']:
answer_updated_on = float(str_to_datetime(answer['updated']).timestamp())
max_updated_on = max(max_updated_on, answer_updated_on)

return max_updated_on
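
Since resuming now filters on update time, an item's timestamp has to reflect the latest activity anywhere in the thread; a question whose newest answer arrived after the question itself was last edited would otherwise be skipped on the next run. An illustrative check of the max logic (the sample values are made up):

```python
from grimoirelab_toolkit.datetime import str_to_datetime

# A question last edited on Nov 1 whose newest answer is from Nov 5:
# the answer's timestamp must win.
item = {
    'updated': '2024-11-01T10:00:00Z',
    'answers_data': [
        {'updated': '2024-11-05T08:30:00Z'},
        {'updated': '2024-11-03T12:00:00Z'},
    ],
}

updated_on = float(str_to_datetime(item['updated']).timestamp())
for answer in item['answers_data']:
    updated_on = max(updated_on, float(str_to_datetime(answer['updated']).timestamp()))

print(updated_on)  # corresponds to 2024-11-05T08:30:00Z
```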

@staticmethod
def metadata_category(item):
@@ -260,25 +219,42 @@ def __init__(self, url, ssl_verify=True, sleep_for_rate=False,
self.sleep_time = sleep_time
self.max_retries = max_retries

def get_questions(self, offset=None):
def get_questions(self, from_date):
"""Retrieve questions from older to newer updated starting offset"""

page = KitsuneClient.FIRST_PAGE

if offset:
page += int(offset / KitsuneClient.ITEMS_PER_PAGE)
from_date = datetime_to_utc(from_date)
failures = 0

while True:
api_questions_url = urijoin(self.base_url, '/question') + '/'

params = {
"page": page,
"ordering": "updated"
"ordering": "updated",
"updated__gt": from_date.isoformat()
}

questions = self.fetch(api_questions_url, params)
yield questions
try:
questions = self.fetch(api_questions_url, params)
yield questions
except requests.exceptions.HTTPError as e:
if e.response.status_code == 500:
if failures >= self.max_retries:
raise e
logger.exception(e)
logger.error("Problem getting Kitsune questions. "
"Loosing %i questions. Going to the next page.",
KitsuneClient.ITEMS_PER_PAGE)
page += 1
failures += 1
continue
else:
# If it is another error just propagate the exception
raise e

failures = 0
questions_json = json.loads(questions)
next_uri = questions_json['next']
if not next_uri:
@@ -348,7 +324,7 @@ def setup_cmd_parser(cls):
"""Returns the Kitsune argument parser."""

parser = BackendCommandArgumentParser(cls.BACKEND,
offset=True,
from_date=True,
ssl_verify=True)

# Required arguments
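Under the hood, the client now walks the questions endpoint ordered by 'updated' and filtered with 'updated__gt', following each page's 'next' URI. A rough equivalent of the first request with plain requests (assuming the API v2 endpoint at /api/2/question/; the date is illustrative and error handling is omitted):

```python
import requests

# First page of questions updated after the given date, oldest first
# (mirrors the params dict built in get_questions above).
response = requests.get(
    "https://support.mozilla.org/api/2/question/",
    params={
        "page": 1,
        "ordering": "updated",
        "updated__gt": "2024-01-01T00:00:00",
    },
)
response.raise_for_status()
data = response.json()
print(data['count'], len(data['results']), data['next'])
```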
11 changes: 11 additions & 0 deletions releases/unreleased/kitsune-resuming-feature-fixed.yml
@@ -0,0 +1,11 @@
---
title: Kitsune resuming feature fixed
category: fixed
author: Santiago Dueñas <[email protected]>
issue: 79
notes: >
The Kitsune backend resuming feature was broken:
it didn't correctly retrieve updated questions
using the 'offset' parameter. Changing the backend
to use the 'from-date' parameter instead has fixed
the problem.