Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for classifying invalid compatibility reports to the http service #3846

Merged
merged 1 commit into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 209 additions & 0 deletions http_service/bugbug_http/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from bugbug import bugzilla, get_bugbug_version, utils
from bugbug_http.models import (
MODELS_NAMES,
classify_broken_site_report,
classify_bug,
classify_issue,
get_config_specific_groups,
Expand Down Expand Up @@ -230,6 +231,29 @@ def create_bug_classification_jobs(
)


def create_broken_site_report_classification_jobs(
    model_name: str, reports: list[dict]
) -> tuple[JobInfo, str, int]:
    """Allocate a job id for a batch of reports and pre-register it in Redis.

    The per-report mapping keys are written to Redis *before* the job is
    queued so a client polling for a result never observes a missing
    mapping (avoids a race between enqueueing and polling).

    Returns a (job info, job id, timeout) triple ready to be queued.
    """
    job_id = get_job_id()

    # Register every report's mapping key up front to avoid race conditions.
    redis_conn.mset(
        {
            JobInfo(
                classify_broken_site_report, model_name, report["uuid"]
            ).mapping_key: job_id
            for report in reports
        }
    )

    return (
        JobInfo(classify_broken_site_report, model_name, reports),
        job_id,
        JOB_TIMEOUT,
    )


def schedule_issue_classification(
model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> None:
Expand Down Expand Up @@ -720,6 +744,191 @@ def batch_prediction(model_name):
return compress_response({"bugs": data}, status_code)


@application.route("/<model_name>/predict/broken_site_report/batch", methods=["POST"])
@cross_origin()
def batch_prediction_broken_site_report(model_name):
    """
    ---
    post:
      description: >
        Post a batch of reports to classify, answer either 200 if all are
        processed or 202 if at least one report is not processed.
        <br/><br/>
        Starts by sending a batch of reports like this:<br/>
        ```
        {"reports": [{"uuid": "954dbc23-91e6-4d6f-a10a-405f46663e31", "title": "https://example.com", "body": "Loads blank page."}]}
        ```<br/><br>

        You will likely get a 202 answer that indicates that no result is
        available yet for any of the reports id you provided with the following
        body:<br/>

        ```
        {"reports": {"<uuid 1>": {ready: False}, "<uuid 2>": {ready: False}}}
        ```<br/><br/>

        Call back the same endpoint with the same uuids a bit later, and you
        will get the results.<br/><br/>

        You might get the following output if some bugs are not available:
        <br/>

        ```
        {"reports": {"<uuid 1>": {"available": False}}}
        ```<br/><br/>

        And you will get the following output once the bugs are available:
        <br/>
        ```
        {"reports": {"<uuid 1>": {"extra_data": {}, "index": 0, "prob": [0], "suggestion": ""}}}
        ```<br/><br/>

        Please be aware that each report could be in a different state, so the
        following output, where a report is returned and another one is still
        being processed, is valid:
        <br/>
        ```
        {"reports": {"<uuid 1>": {"available": False}, "<uuid 2>": {"extra_data": {}, "index": 0, "prob": [0], "suggestion": ""}}}
        ```
      summary: Classify a batch of reports
      parameters:
      - name: model_name
        in: path
        schema: ModelName
      requestBody:
        description: The list of reports to classify
        content:
          application/json:
            schema:
              type: object
              properties:
                reports:
                  type: array
                  items:
                    type: object
                    properties:
                      uuid:
                        type: string
                      title:
                        type: string
                      body:
                        type: string
            examples:
              cat:
                summary: An example of payload
                value:
                  reports:
                  - uuid: "954dbc23-91e6-4d6f-a10a-405f46663e31"
                    title: "https://example.com"
                    body: "Loads blank page."
      responses:
        200:
          description: A list of results
          content:
            application/json:
              schema:
                type: object
                additionalProperties: true
              example:
                reports:
                  <uuid 1>:
                    extra_data: {}
                    index: 0
                    prob: [0]
                    suggestion: string
                  <uuid 2>:
                    extra_data: {}
                    index: 0
                    prob: [0]
                    suggestion: string
        202:
          description: A temporary answer for reports being processed
          content:
            application/json:
              schema:
                type: object
                items:
                  type: object
                  properties:
                    ready:
                      type: boolean
                      enum: [False]
              example:
                reports:
                  <uuid 1>:
                    extra_data: {}
                    index: 0
                    prob: [0]
                    suggestion: string
                  <uuid 2>: {ready: False}
        401:
          description: API key is missing
          content:
            application/json:
              schema: UnauthorizedError
    """
    headers = request.headers

    auth = headers.get(API_TOKEN)

    if not auth:
        return jsonify(UnauthorizedError().dump({})), 401
    else:
        LOGGER.info("Request with API TOKEN %r", auth)

    if model_name not in MODELS_NAMES:
        return jsonify({"error": f"Model {model_name} doesn't exist"}), 404

    batch_body = orjson.loads(request.data)

    # Every report must carry a uuid (result key), a title and a body.
    schema = {
        "reports": {
            "type": "list",
            "minlength": 1,
            "schema": {
                "type": "dict",
                "schema": {
                    "uuid": {"type": "string", "required": True},
                    "title": {"type": "string", "required": True},
                    "body": {"type": "string", "required": True},
                },
            },
        }
    }
    validator = Validator()
    if not validator.validate(batch_body, schema):
        return jsonify({"errors": validator.errors}), 400

    reports = batch_body["reports"]

    status_code = 200
    data = {}
    missing_reports = []

    for report in reports:
        report_uuid = report["uuid"]
        job = JobInfo(classify_broken_site_report, model_name, report_uuid)

        data[report_uuid] = get_result(job)
        if not data[report_uuid]:
            # Only re-queue reports that are neither done nor in flight.
            if not is_pending(job):
                missing_reports.append(report)
            status_code = 202
            data[report_uuid] = {"ready": False}

    # Queue the unprocessed reports in chunks of at most 100 to bound the
    # size of a single classification job.
    queue_job_list: Queue = []

    for i in range(0, len(missing_reports), 100):
        report_chunk = missing_reports[i : i + 100]
        job_info, job_id, timeout = create_broken_site_report_classification_jobs(
            model_name, report_chunk
        )
        queue_job_list.append(
            prepare_queue_job(job_info, job_id=job_id, timeout=timeout)
        )
    q.enqueue_many(queue_job_list)

    return compress_response({"reports": data}, status_code)


@application.route("/push/<path:branch>/<rev>/schedules")
@cross_origin()
def push_schedules(branch, rev):
Expand Down
41 changes: 41 additions & 0 deletions http_service/bugbug_http/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
MODELS_NAMES = [
"defectenhancementtask",
"component",
"invalidcompatibilityreport",
"needsdiagnosis",
"regression",
"stepstoreproduce",
Expand Down Expand Up @@ -170,6 +171,46 @@ def classify_issue(
return "OK"


def classify_broken_site_report(model_name: str, reports_data: list[dict]) -> str:
    """Classify a batch of broken-site reports and store the results in Redis.

    Args:
        model_name: name of the model to classify with; must be resolvable
            through MODEL_CACHE.
        reports_data: list of dicts, each with "uuid", "title" and "body" keys.

    Returns:
        "OK" when every report was classified and its result stored,
        "NOK" when the batch is empty or the model is unavailable.
    """
    # Imported here to avoid a circular import between app and models.
    from bugbug_http.app import JobInfo

    # Key by uuid (deduplicating) and keep only the fields the model consumes.
    reports = {
        report["uuid"]: {"title": report["title"], "body": report["body"]}
        for report in reports_data
    }

    if not reports:
        return "NOK"

    model = MODEL_CACHE.get(model_name)

    if not model:
        # Lazy %-style args: the message is only formatted if the record is emitted.
        LOGGER.info("Missing model %r, aborting", model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()
    probs = model.classify(list(reports.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, report_uuid in enumerate(reports.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        # Store each result under its per-report key so the HTTP layer can
        # poll for it independently of the other reports in the batch.
        job = JobInfo(classify_broken_site_report, model_name, report_uuid)
        setkey(job.result_key, orjson.dumps(data), compress=True)

    return "OK"


@lru_cache(maxsize=None)
def get_known_tasks() -> tuple[str, ...]:
with open("known_tasks", "r") as f:
Expand Down
54 changes: 54 additions & 0 deletions http_service/tests/test_bug_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,60 @@ def do_request():
}


def test_model_predict_batch_broken_site_reports(client, jobs, add_result):
    """End-to-end flow of the broken-site-report batch prediction endpoint:
    pending -> partially ready (202) -> fully ready (200)."""
    report_one = {
        "uuid": "954dbc23-91e6-4d6f-a10a-405f46663e31",
        "title": "https://example.com",
        "body": "Loads blank page.",
    }
    report_two = {
        "uuid": "af7e27b5-3ce3-46e1-8294-5baea36782cc",
        "title": "https://test.com",
        "body": "Will not load",
    }
    reports = [report_one, report_two]
    expected_result = {
        "prob": [0.11845558881759644, 0.8815444111824036],
        "index": 1,
        "class": 1,
        "extra_data": {},
    }

    def post_batch():
        return client.post(
            "/invalidcompatibilityreport/predict/broken_site_report/batch",
            data=json.dumps({"reports": reports}),
            headers={API_TOKEN: "test"},
        )

    # Nothing is classified yet: every report comes back pending.
    response = post_batch()
    assert response.status_code == 202
    pending = {report["uuid"]: {"ready": False} for report in reports}
    assert retrieve_compressed_reponse(response) == {"reports": pending}
    assert len(jobs) == 1

    # Mark the first report as classified; the second stays pending.
    keys = next(iter(jobs.values()))
    add_result(keys[0], expected_result)

    response = post_batch()
    assert response.status_code == 202
    assert retrieve_compressed_reponse(response) == {
        "reports": {
            report_one["uuid"]: expected_result,
            report_two["uuid"]: {"ready": False},
        }
    }

    # Mark the second report as classified too; everything is ready now.
    add_result(keys[1], expected_result)

    response = post_batch()
    assert response.status_code == 200
    assert retrieve_compressed_reponse(response) == {
        "reports": {report["uuid"]: expected_result for report in reports}
    }


def test_for_missing_bugs(client, responses):
existed_bug_ids = [1602463, 1619699]
missing_bug_ids = [1598744, 1615281, 1566486]
Expand Down