Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for classifying invalid compatibility reports to the http service #3846

Merged
merged 1 commit into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 209 additions & 0 deletions http_service/bugbug_http/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from bugbug import bugzilla, get_bugbug_version, utils
from bugbug_http.models import (
MODELS_NAMES,
classify_broken_site_report,
classify_bug,
classify_issue,
get_config_specific_groups,
Expand Down Expand Up @@ -230,6 +231,29 @@ def create_bug_classification_jobs(
)


def create_broken_site_report_classification_jobs(
    model_name: str, reports: list[dict]
) -> tuple[JobInfo, str, int]:
    """Allocate a job id for a batch of reports and pre-register it in Redis.

    The per-report mapping keys are written to Redis *before* the job is
    queued so a client polling for a result never observes a missing
    mapping (avoids a race between enqueueing and polling).

    Returns a (job info, job id, timeout) triple ready to be queued.
    """
    job_id = get_job_id()

    # Register every report's mapping key up front to avoid race conditions.
    redis_conn.mset(
        {
            JobInfo(
                classify_broken_site_report, model_name, report["uuid"]
            ).mapping_key: job_id
            for report in reports
        }
    )

    return (
        JobInfo(classify_broken_site_report, model_name, reports),
        job_id,
        JOB_TIMEOUT,
    )


def schedule_issue_classification(
model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> None:
Expand Down Expand Up @@ -720,6 +744,191 @@ def batch_prediction(model_name):
return compress_response({"bugs": data}, status_code)


@application.route("/<model_name>/predict/broken_site_report/batch", methods=["POST"])
@cross_origin()
def batch_prediction_broken_site_report(model_name):
    """
    ---
    post:
      description: >
        Post a batch of reports to classify, answer either 200 if all are
        processed or 202 if at least one report is not processed.
        <br/><br/>
        Starts by sending a batch of reports like this:<br/>
        ```
        {"reports": [{"uuid": "954dbc23-91e6-4d6f-a10a-405f46663e31", "title": "https://example.com", "body": "Loads blank page."}]}
        ```<br/><br>

        You will likely get a 202 answer that indicates that no result is
        available yet for any of the reports id you provided with the following
        body:<br/>

        ```
        {"reports": {"<uuid 1>": {ready: False}, "<uuid 2>": {ready: False}}}
        ```<br/><br/>

        Call back the same endpoint with the same uuids a bit later, and you
        will get the results.<br/><br/>

        You might get the following output if some bugs are not available:
        <br/>

        ```
        {"reports": {"<uuid 1>": {"available": False}}}
        ```<br/><br/>

        And you will get the following output once the bugs are available:
        <br/>
        ```
        {"reports": {"<uuid 1>": {"extra_data": {}, "index": 0, "prob": [0], "suggestion": ""}}}
        ```<br/><br/>

        Please be aware that each report could be in a different state, so the
        following output, where a report is returned and another one is still
        being processed, is valid:
        <br/>
        ```
        {"reports": {"<uuid 1>": {"available": False}, "<uuid 2>": {"extra_data": {}, "index": 0, "prob": [0], "suggestion": ""}}}
        ```
      summary: Classify a batch of reports
      parameters:
      - name: model_name
        in: path
        schema: ModelName
      requestBody:
        description: The list of reports to classify
        content:
          application/json:
            schema:
              type: object
              properties:
                reports:
                  type: array
                  items:
                    type: object
                    properties:
                      uuid:
                        type: string
                      title:
                        type: string
                      body:
                        type: string
            examples:
              cat:
                summary: An example of payload
                value:
                  reports:
                  - uuid: "954dbc23-91e6-4d6f-a10a-405f46663e31"
                    title: "https://example.com"
                    body: "Loads blank page."
      responses:
        200:
          description: A list of results
          content:
            application/json:
              schema:
                type: object
                additionalProperties: true
              example:
                reports:
                  <uuid 1>:
                    extra_data: {}
                    index: 0
                    prob: [0]
                    suggestion: string
                  <uuid 2>:
                    extra_data: {}
                    index: 0
                    prob: [0]
                    suggestion: string
        202:
          description: A temporary answer for reports being processed
          content:
            application/json:
              schema:
                type: object
                items:
                  type: object
                  properties:
                    ready:
                      type: boolean
                      enum: [False]
              example:
                reports:
                  <uuid 1>:
                    extra_data: {}
                    index: 0
                    prob: [0]
                    suggestion: string
                  <uuid 2>: {ready: False}
        401:
          description: API key is missing
          content:
            application/json:
              schema: UnauthorizedError
    """
    headers = request.headers

    auth = headers.get(API_TOKEN)

    if not auth:
        return jsonify(UnauthorizedError().dump({})), 401
    else:
        LOGGER.info("Request with API TOKEN %r", auth)

    if model_name not in MODELS_NAMES:
        return jsonify({"error": f"Model {model_name} doesn't exist"}), 404

    batch_body = orjson.loads(request.data)

    # Every report must carry a uuid (result key), a title and a body.
    schema = {
        "reports": {
            "type": "list",
            "minlength": 1,
            "schema": {
                "type": "dict",
                "schema": {
                    "uuid": {"type": "string", "required": True},
                    "title": {"type": "string", "required": True},
                    "body": {"type": "string", "required": True},
                },
            },
        }
    }
    validator = Validator()
    if not validator.validate(batch_body, schema):
        return jsonify({"errors": validator.errors}), 400

    reports = batch_body["reports"]

    status_code = 200
    data = {}
    missing_reports = []

    for report in reports:
        report_uuid = report["uuid"]
        job = JobInfo(classify_broken_site_report, model_name, report_uuid)

        data[report_uuid] = get_result(job)
        if not data[report_uuid]:
            # Only re-queue reports that are neither done nor in flight.
            if not is_pending(job):
                missing_reports.append(report)
            status_code = 202
            data[report_uuid] = {"ready": False}

    # Queue the unprocessed reports in chunks of at most 100 to bound the
    # size of a single classification job.
    queue_job_list: Queue = []

    for i in range(0, len(missing_reports), 100):
        report_chunk = missing_reports[i : i + 100]
        job_info, job_id, timeout = create_broken_site_report_classification_jobs(
            model_name, report_chunk
        )
        queue_job_list.append(
            prepare_queue_job(job_info, job_id=job_id, timeout=timeout)
        )
    q.enqueue_many(queue_job_list)

    return compress_response({"reports": data}, status_code)


@application.route("/push/<path:branch>/<rev>/schedules")
@cross_origin()
def push_schedules(branch, rev):
Expand Down
41 changes: 41 additions & 0 deletions http_service/bugbug_http/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
MODELS_NAMES = [
"defectenhancementtask",
"component",
"invalidcompatibilityreport",
"needsdiagnosis",
"regression",
"stepstoreproduce",
Expand Down Expand Up @@ -170,6 +171,46 @@ def classify_issue(
return "OK"


def classify_broken_site_report(model_name: str, reports_data: list[dict]) -> str:
    """Classify a batch of broken-site reports and store the results in Redis.

    Args:
        model_name: name of the model to classify with; must be resolvable
            through MODEL_CACHE.
        reports_data: list of dicts, each with "uuid", "title" and "body" keys.

    Returns:
        "OK" when every report was classified and its result stored,
        "NOK" when the batch is empty or the model is unavailable.
    """
    # Imported here to avoid a circular import between app and models.
    from bugbug_http.app import JobInfo

    # Key by uuid (deduplicating) and keep only the fields the model consumes.
    reports = {
        report["uuid"]: {"title": report["title"], "body": report["body"]}
        for report in reports_data
    }

    if not reports:
        return "NOK"

    model = MODEL_CACHE.get(model_name)

    if not model:
        # Lazy %-style args: the message is only formatted if the record is emitted.
        LOGGER.info("Missing model %r, aborting", model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()
    probs = model.classify(list(reports.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    for i, report_uuid in enumerate(reports.keys()):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        # Store each result under its per-report key so the HTTP layer can
        # poll for it independently of the other reports in the batch.
        job = JobInfo(classify_broken_site_report, model_name, report_uuid)
        setkey(job.result_key, orjson.dumps(data), compress=True)

    return "OK"


@lru_cache(maxsize=None)
def get_known_tasks() -> tuple[str, ...]:
with open("known_tasks", "r") as f:
Expand Down
54 changes: 54 additions & 0 deletions http_service/tests/test_bug_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,60 @@ def do_request():
}


def test_model_predict_batch_broken_site_reports(client, jobs, add_result):
    """End-to-end flow of the broken-site-report batch prediction endpoint:
    pending -> partially ready (202) -> fully ready (200)."""
    report_one = {
        "uuid": "954dbc23-91e6-4d6f-a10a-405f46663e31",
        "title": "https://example.com",
        "body": "Loads blank page.",
    }
    report_two = {
        "uuid": "af7e27b5-3ce3-46e1-8294-5baea36782cc",
        "title": "https://test.com",
        "body": "Will not load",
    }
    reports = [report_one, report_two]
    expected_result = {
        "prob": [0.11845558881759644, 0.8815444111824036],
        "index": 1,
        "class": 1,
        "extra_data": {},
    }

    def post_batch():
        return client.post(
            "/invalidcompatibilityreport/predict/broken_site_report/batch",
            data=json.dumps({"reports": reports}),
            headers={API_TOKEN: "test"},
        )

    # Nothing is classified yet: every report comes back pending.
    response = post_batch()
    assert response.status_code == 202
    pending = {report["uuid"]: {"ready": False} for report in reports}
    assert retrieve_compressed_reponse(response) == {"reports": pending}
    assert len(jobs) == 1

    # Mark the first report as classified; the second stays pending.
    keys = next(iter(jobs.values()))
    add_result(keys[0], expected_result)

    response = post_batch()
    assert response.status_code == 202
    assert retrieve_compressed_reponse(response) == {
        "reports": {
            report_one["uuid"]: expected_result,
            report_two["uuid"]: {"ready": False},
        }
    }

    # Mark the second report as classified too; everything is ready now.
    add_result(keys[1], expected_result)

    response = post_batch()
    assert response.status_code == 200
    assert retrieve_compressed_reponse(response) == {
        "reports": {report["uuid"]: expected_result for report in reports}
    }


def test_for_missing_bugs(client, responses):
existed_bug_ids = [1602463, 1619699]
missing_bug_ids = [1598744, 1615281, 1566486]
Expand Down