diff --git a/.github/workflows/test_benchmark_collection_models.yml b/.github/workflows/test_benchmark_collection_models.yml index 0b14d73716..3e46af8992 100644 --- a/.github/workflows/test_benchmark_collection_models.yml +++ b/.github/workflows/test_benchmark_collection_models.yml @@ -52,25 +52,28 @@ jobs: AMICI_PARALLEL_COMPILE="" pip3 install -v --user \ $(ls -t python/sdist/dist/amici-*.tar.gz | head -1)[petab,test,vis] - - run: | + - name: Install test dependencies + run: | python3 -m pip uninstall -y petab && python3 -m pip install git+https://github.com/petab-dev/libpetab-python.git@develop \ - && python3 -m pip install -U sympy + && python3 -m pip install -U sympy \ + && python3 -m pip install git+https://github.com/ICB-DCM/fiddy.git - # retrieve test models - - name: Download and test benchmark collection + - name: Download benchmark collection run: | git clone --depth 1 https://github.com/benchmarking-initiative/Benchmark-Models-PEtab.git \ - && export BENCHMARK_COLLECTION="$(pwd)/Benchmark-Models-PEtab/Benchmark-Models/" \ - && pip3 install -e $BENCHMARK_COLLECTION/../src/python \ - && AMICI_PARALLEL_COMPILE="" tests/benchmark-models/test_benchmark_collection.sh + && python3 -m pip install -e Benchmark-Models-PEtab/src/python + + - name: Run tests + env: + AMICI_PARALLEL_COMPILE: "" + run: | + cd tests/benchmark-models && pytest --durations=10 - # run gradient checks - - name: Run Gradient Checks + # collect & upload results + - name: Aggregate results run: | - pip install git+https://github.com/ICB-DCM/fiddy.git \ - && cd tests/benchmark-models && pytest --durations=10 ./test_petab_benchmark.py + cd tests/benchmark-models && python3 evaluate_benchmark.py - # upload results - uses: actions/upload-artifact@v4 with: name: computation-times-${{ matrix.python-version }}-${{ matrix.extract_subexpressions }} diff --git a/tests/benchmark-models/test_benchmark_collection.sh b/tests/benchmark-models/test_benchmark_collection.sh deleted file mode 100755 index d3d1cad712..0000000000 --- a/tests/benchmark-models/test_benchmark_collection.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/bin/bash -# Import and run selected benchmark models with nominal parameters and check -# agreement with reference values -# -# Expects environment variable BENCHMARK_COLLECTION to provide path to -# benchmark collection model directory - -# Confirmed to be working -models=" -Bachmann_MSB2011 -Beer_MolBioSystems2014 -Boehm_JProteomeRes2014 -Borghans_BiophysChem1997 -Brannmark_JBC2010 -Bruno_JExpBot2016 -Crauste_CellSystems2017 -Elowitz_Nature2000 -Fiedler_BMCSystBiol2016 -Fujita_SciSignal2010 -Isensee_JCB2018 -Lucarelli_CellSystems2018 -Schwen_PONE2014 -Smith_BMCSystBiol2013 -Sneyd_PNAS2002 -Weber_BMC2015 -Zheng_PNAS2012" - -# -# not merged: -# Becker_Science2010 (multiple models) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Becker_Science2010 -# Hass_PONE2017 (???) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Hass_PONE2017 -# Korkut_eLIFE2015 (???) 
https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Korkut_eLIFE2015 -# Casaletto_PNAS2019 (yaml missing) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Casaletto_PNAS2019 -# Merkle_PCB2016 (model missing) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Merkle_PCB2016 -# Parmar_PCB2019 (SBML extensions) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Parmar_PCB2019 -# Swameye_PNAS2003 (splines) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Swameye_PNAS2003 -# Sobotta_Frontiers2017 (???) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Sobotta_Frontiers2017 -# Raia_CancerResearch2011 (state dependent sigmas) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Raia_CancerResearch2011 -# -# no reference value: -# Alkan_SciSignal2018 (d2d: Alkan_DDRP_SciSignal2018) -# Bertozzi_PNAS2020 (gh: vanako, doi: https://doi.org/10.1073/pnas.2006520117, code/data: https://github.com/gomohler/pnas2020 (R)) -# Blasi_CellSystems2016 (gh: Leonard Schmiester, doi: https://doi.org/10.1016/j.cels.2016.01.002, code/data: not available) -# Giordano_Nature2020 (gh: Paul Jonas Jost, doi: https://doi.org/10.1038/s41591-020-0883-7, code/data: http://users.dimi.uniud.it/~giulia.giordano/docs/papers/SIDARTHEcode.zip (MATLAB)) -# Laske_PLOSComputBiol2019 (gh: Clemens Peiter, doi: https://doi.org/10.1128/JVI.00080-12 (?), code/data: ???) -# Okuonghae_ChaosSolitonsFractals2020 (gh: Paul Jonas Jost, doi: https://doi.org/10.1016/j.chaos.2020.110032, code/data: ???) -# Oliveira_NatCommun2021 (gh: lorenapohl, doi: https://doi.org/10.1038/s41467-020-19798-3, code: https://github.com/cidacslab/Mathematical-and-Statistical-Modeling-of-COVID19-in-Brazil (python) data: https://infovis.sei.ba.gov.br/covid19/ ) -# Perelson_Science1996 (gh: Philipp Staedter, doi: https://doi.org/10.1126/science.271.5255.1582, code/data: ???) -# Rahman_MBS2016 (gh: Yannik Schaelte, doi: https://doi.org/10.1016/j.mbs.2016.07.009, code: not available, data: table in paper ...) -# Raimundez_PCB2020 (gh: Elba Raimundez, doi: https://doi.org/10.1371/journal.pcbi.1007147, code/data: https://zenodo.org/record/2908234#.Y5hUUS8w3yw (d2d)) -# SalazarCavazos_MBoC2020 (gh: Dilan Pathirana, doi: https://doi.org/10.1091/mbc.E19-09-0548, code/data: supplement (BNGL)) -# Zhao_QuantBiol2020 (gh: Iva Ewert, doi: https://doi.org/10.1007/s40484-020-0199-0, code: not available, data: table in supp) -# -# covered by performance test: -# Froehlich_CellSystems2018 -# -# Unknown reasons: -# Chen_MSB2009 -# - -set -e - -[[ -n "${BENCHMARK_COLLECTION}" ]] && model_dir="${BENCHMARK_COLLECTION}" - -function show_help() { - echo "-h: this help; -n: dry run, print commands; -b path_to_models_dir" -} - -OPTIND=1 -while getopts "h?nb:" opt; do - case "$opt" in - h | \?) 
- show_help - exit 0 - ;; - n) - dry_run=1 - ;; - b) - model_dir=$OPTARG - ;; - esac -done - -script_path=$(dirname "$BASH_SOURCE") -script_path=$(cd "$script_path" && pwd) - -for model in $models; do - yaml="${model_dir}"/"${model}"/"${model}".yaml - - # different naming scheme - if [[ "$model" == "Bertozzi_PNAS2020" ]]; then - yaml="${model_dir}"/"${model}"/problem.yaml - fi - - # problems we need to flatten - to_flatten=( - "Bruno_JExpBot2016" "Chen_MSB2009" "Crauste_CellSystems2017" - "Fiedler_BMCSystBiol2016" "Fujita_SciSignal2010" "SalazarCavazos_MBoC2020" - ) - flatten="" - for item in "${to_flatten[@]}"; do - if [[ "$item" == "$model" ]]; then - flatten="--flatten" - break - fi - done - - amici_model_dir=test_bmc/"${model}" - mkdir -p "$amici_model_dir" - cmd_import="amici_import_petab ${yaml} -o ${amici_model_dir} -n ${model} ${flatten}" - cmd_run="$script_path/test_petab_model.py -y ${yaml} -d ${amici_model_dir} -m ${model} -c" - - printf '=%.0s' {1..40} - printf " %s " "${model}" - printf '=%.0s' {1..40} - echo - - if [[ -z "$dry_run" ]]; then - $cmd_import - $cmd_run - else - echo "$cmd_import" - echo "$cmd_run" - fi - - printf '=%.0s' {1..100} - echo - echo -done - -cd "$script_path" && python evaluate_benchmark.py - -# Test deprecated import from individual PEtab files -model="Zheng_PNAS2012" -problem_dir="${model_dir}/${model}" -amici_model_dir=test_bmc/"${model}-deprecated" -cmd_import="amici_import_petab -s "${problem_dir}/model_${model}.xml" \ - -m "${problem_dir}/measurementData_${model}.tsv" \ - -c "${problem_dir}/experimentalCondition_${model}.tsv" \ - -p "${problem_dir}/parameters_${model}.tsv" \ - -b "${problem_dir}/observables_${model}.tsv" \ - -o ${amici_model_dir} -n ${model} --no-compile" - -if [[ -z "$dry_run" ]]; then - $cmd_import -else - echo "$cmd_import" -fi diff --git a/tests/benchmark-models/test_petab_benchmark.py b/tests/benchmark-models/test_petab_benchmark.py index 69df16f181..b4e1f50e68 100644 --- a/tests/benchmark-models/test_petab_benchmark.py +++ b/tests/benchmark-models/test_petab_benchmark.py @@ -1,9 +1,12 @@ -"""Tests for simulate_petab on PEtab benchmark problems.""" +"""Tests based on the PEtab benchmark problems. -from pathlib import Path +Tests simulate_petab, correctness of the log-likelihood computation at nominal +parameters, correctness of the gradient computation, and simulation times +for a subset of the benchmark problems. 
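+
+Reference log-likelihoods and simulation times are read from
+benchmark_models.yaml in this directory; per-problem timings are written to
+<problem_id>_benchmark.csv files for aggregation by evaluate_benchmark.py.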
+""" +from pathlib import Path import fiddy - import amici import numpy as np import pandas as pd @@ -19,13 +22,33 @@ from fiddy.derivative_check import NumpyIsCloseDerivativeCheck from fiddy.extensions.amici import simulate_petab_to_cached_functions from fiddy.success import Consistency +import contextlib +import logging +import yaml +from amici.logging import get_logger +from amici.petab.simulations import ( + LLH, + RDATAS, + rdatas_to_measurement_df, + simulate_petab, +) +from petab.v1.visualize import plot_problem -repo_root = Path(__file__).parent.parent.parent +logger = get_logger(f"amici.{__name__}", logging.WARNING) -# reuse compiled models from test_benchmark_collection.sh +script_dir = Path(__file__).parent.absolute() +repo_root = script_dir.parent.parent benchmark_outdir = repo_root / "test_bmc" -models = set(benchmark_models_petab.MODELS) - { + +# reference values for simulation times and log-likelihoods +references_yaml = script_dir / "benchmark_models.yaml" +with open(references_yaml) as f: + reference_values = yaml.full_load(f) + +# problem IDs for which to check the gradient +# TODO: extend +problems_for_gradient_check = set(benchmark_models_petab.MODELS) - { # excluded due to excessive runtime "Bachmann_MSB2011", "Chen_MSB2009", @@ -39,15 +62,75 @@ "Smith_BMCSystBiol2013", # excluded due to excessive numerical failures "Crauste_CellSystems2017", - "Fujita_SciSignal2010", - # FIXME: re-enable - "Raia_CancerResearch2011", } -models = list(sorted(models)) +problems_for_gradient_check = list(sorted(problems_for_gradient_check)) + + +# Problems for checking the log-likelihood computation at nominal parameters +# +# not merged: +# Becker_Science2010 (multiple models) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Becker_Science2010 +# Hass_PONE2017 (???) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Hass_PONE2017 +# Korkut_eLIFE2015 (???) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Korkut_eLIFE2015 +# Casaletto_PNAS2019 (yaml missing) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Casaletto_PNAS2019 +# Merkle_PCB2016 (model missing) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Merkle_PCB2016 +# Parmar_PCB2019 (SBML extensions) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Parmar_PCB2019 +# Swameye_PNAS2003 (splines) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Swameye_PNAS2003 +# Sobotta_Frontiers2017 (???) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Sobotta_Frontiers2017 +# Raia_CancerResearch2011 (state dependent sigmas) https://github.com/Benchmarking-Initiative/Benchmark-Models-PEtab/tree/model_Raia_CancerResearch2011 +# +# no reference value: +# Alkan_SciSignal2018 (d2d: Alkan_DDRP_SciSignal2018) +# Bertozzi_PNAS2020 (gh: vanako, doi: https://doi.org/10.1073/pnas.2006520117, code/data: https://github.com/gomohler/pnas2020 (R)) +# Blasi_CellSystems2016 (gh: Leonard Schmiester, doi: https://doi.org/10.1016/j.cels.2016.01.002, code/data: not available) +# Giordano_Nature2020 (gh: Paul Jonas Jost, doi: https://doi.org/10.1038/s41591-020-0883-7, code/data: http://users.dimi.uniud.it/~giulia.giordano/docs/papers/SIDARTHEcode.zip (MATLAB)) +# Laske_PLOSComputBiol2019 (gh: Clemens Peiter, doi: https://doi.org/10.1128/JVI.00080-12 (?), code/data: ???) 
+# Okuonghae_ChaosSolitonsFractals2020 (gh: Paul Jonas Jost, doi: https://doi.org/10.1016/j.chaos.2020.110032, code/data: ???) +# Oliveira_NatCommun2021 (gh: lorenapohl, doi: https://doi.org/10.1038/s41467-020-19798-3, code: https://github.com/cidacslab/Mathematical-and-Statistical-Modeling-of-COVID19-in-Brazil (python) data: https://infovis.sei.ba.gov.br/covid19/ ) +# Perelson_Science1996 (gh: Philipp Staedter, doi: https://doi.org/10.1126/science.271.5255.1582, code/data: ???) +# Rahman_MBS2016 (gh: Yannik Schaelte, doi: https://doi.org/10.1016/j.mbs.2016.07.009, code: not available, data: table in paper ...) +# Raimundez_PCB2020 (gh: Elba Raimundez, doi: https://doi.org/10.1371/journal.pcbi.1007147, code/data: https://zenodo.org/record/2908234#.Y5hUUS8w3yw (d2d)) +# SalazarCavazos_MBoC2020 (gh: Dilan Pathirana, doi: https://doi.org/10.1091/mbc.E19-09-0548, code/data: supplement (BNGL)) +# Zhao_QuantBiol2020 (gh: Iva Ewert, doi: https://doi.org/10.1007/s40484-020-0199-0, code: not available, data: table in supp) +# +# covered by performance test: +# Froehlich_CellSystems2018 +# +# Unknown reasons: +# Chen_MSB2009 +# +# Confirmed to be working: +# TODO: extend +problems_for_llh_check = [ + "Bachmann_MSB2011", + "Beer_MolBioSystems2014", + "Boehm_JProteomeRes2014", + "Borghans_BiophysChem1997", + "Brannmark_JBC2010", + "Bruno_JExpBot2016", + "Crauste_CellSystems2017", + "Elowitz_Nature2000", + "Fiedler_BMCSystBiol2016", + "Fujita_SciSignal2010", + "Isensee_JCB2018", + "Lucarelli_CellSystems2018", + "Schwen_PONE2014", + "Smith_BMCSystBiol2013", + "Sneyd_PNAS2002", + "Weber_BMC2015", + "Zheng_PNAS2012", +] + +# all PEtab problems we need to import +problems = list( + sorted(set(problems_for_gradient_check + problems_for_llh_check)) +) @dataclass class GradientCheckSettings: + """Problem-specific settings for gradient checks.""" + # Absolute and relative tolerances for simulation atol_sim: float = 1e-16 rtol_sim: float = 1e-12 @@ -93,6 +176,10 @@ class GradientCheckSettings: settings["Brannmark_JBC2010"] = GradientCheckSettings( ss_sensitivity_mode=amici.SteadyStateSensitivityMode.integrationOnly, ) +settings["Fujita_SciSignal2010"] = GradientCheckSettings( + atol_check=1e-7, + rtol_check=5e-4, +) settings["Giordano_Nature2020"] = GradientCheckSettings( atol_check=1e-6, rtol_check=1e-3, rng_seed=1 ) @@ -107,6 +194,10 @@ class GradientCheckSettings: atol_sim=1e-12, rtol_sim=1e-12, ) +settings["Raia_CancerResearch2011"] = GradientCheckSettings( + atol_check=1e-10, + rtol_check=1e-3, +) settings["Smith_BMCSystBiol2013"] = GradientCheckSettings( atol_sim=1e-10, rtol_sim=1e-10, @@ -140,6 +231,175 @@ class GradientCheckSettings: debug_path.mkdir(exist_ok=True, parents=True) +@pytest.fixture(scope="session", params=problems, ids=problems) +def benchmark_problem(request): + """Fixture providing model and PEtab problem for a problem from + the benchmark problem collection.""" + problem_id = request.param + petab_problem = benchmark_models_petab.get_problem(problem_id) + if measurement_table_has_timepoint_specific_mappings( + petab_problem.measurement_df, + ): + petab.flatten_timepoint_specific_output_overrides(petab_problem) + + # Setup AMICI objects. 
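+    # import_petab_problem() compiles the model on first use and reuses an
+    # already-compiled model on subsequent calls; combined with the session
+    # scope, each problem is imported at most once per test run.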
+ amici_model = import_petab_problem( + petab_problem, + model_output_dir=benchmark_outdir / problem_id, + ) + return problem_id, petab_problem, amici_model + + +@pytest.mark.filterwarnings( + "ignore:divide by zero encountered in log", + # https://github.com/AMICI-dev/AMICI/issues/18 + "ignore:Adjoint sensitivity analysis for models with discontinuous " + "right hand sides .*:UserWarning", +) +def test_nominal_parameters_llh(benchmark_problem): + """Test the log-likelihood computation at nominal parameters. + + Also check that the simulation time is within the reference range. + """ + problem_id, petab_problem, amici_model = benchmark_problem + if problem_id not in problems_for_llh_check: + pytest.skip("Excluded from log-likelihood check.") + + amici_solver = amici_model.getSolver() + amici_solver.setAbsoluteTolerance(1e-8) + amici_solver.setRelativeTolerance(1e-8) + amici_solver.setMaxSteps(10_000) + if problem_id in ("Brannmark_JBC2010", "Isensee_JCB2018"): + amici_model.setSteadyStateSensitivityMode( + amici.SteadyStateSensitivityMode.integrationOnly + ) + + times = dict() + + for label, sensi_mode in { + "t_sim": amici.SensitivityMethod.none, + "t_fwd": amici.SensitivityMethod.forward, + "t_adj": amici.SensitivityMethod.adjoint, + }.items(): + amici_solver.setSensitivityMethod(sensi_mode) + if sensi_mode == amici.SensitivityMethod.none: + amici_solver.setSensitivityOrder(amici.SensitivityOrder.none) + else: + amici_solver.setSensitivityOrder(amici.SensitivityOrder.first) + + res_repeats = [ + simulate_petab( + petab_problem=petab_problem, + amici_model=amici_model, + solver=amici_solver, + log_level=logging.DEBUG, + ) + for _ in range(3) # repeat to get more stable timings + ] + res = res_repeats[0] + + times[label] = np.min( + [ + sum(r.cpu_time + r.cpu_timeB for r in res[RDATAS]) / 1000 + # only forwards/backwards simulation + for res in res_repeats + ] + ) + + if sensi_mode == amici.SensitivityMethod.none: + rdatas = res[RDATAS] + llh = res[LLH] + # TODO: check that all llh match, check that all sllh match + times["np"] = sum(petab_problem.parameter_df[petab.ESTIMATE]) + + pd.Series(times).to_csv(script_dir / f"{problem_id}_benchmark.csv") + for rdata in rdatas: + assert ( + rdata.status == amici.AMICI_SUCCESS + ), f"Simulation failed for {rdata.id}" + + if debug: + # create simulation PEtab table + sim_df = rdatas_to_measurement_df( + rdatas=rdatas, + model=amici_model, + measurement_df=petab_problem.measurement_df, + ) + sim_df.rename( + columns={petab.MEASUREMENT: petab.SIMULATION}, inplace=True + ) + sim_df.to_csv( + benchmark_outdir / problem_id / f"{problem_id}_simulation.tsv", + index=False, + sep="\t", + ) + + # visualize fit, save to file + with contextlib.suppress(NotImplementedError): + axs = plot_problem( + petab_problem=petab_problem, simulations_df=sim_df + ) + for plot_id, ax in axs.items(): + fig_path = ( + benchmark_outdir + / problem_id + / f"{problem_id}_{plot_id}_vis.png" + ) + logger.info(f"Saving figure to {fig_path}") + ax.get_figure().savefig(fig_path, dpi=150) + + try: + ref_llh = reference_values[problem_id]["llh"] + rdiff = np.abs((llh - ref_llh) / ref_llh) + rtol = 1e-3 + adiff = np.abs(llh - ref_llh) + atol = 1e-3 + tolstr = ( + f" Absolute difference is {adiff:.2e} " + f"(tol {atol:.2e}) and relative difference is " + f"{rdiff:.2e} (tol {rtol:.2e})." + ) + + if np.isclose(llh, ref_llh, rtol=rtol, atol=atol): + logger.info( + f"Computed llh {llh:.4e} matches reference {ref_llh:.4e}." 
+ + tolstr + ) + else: + pytest.fail( + f"Computed llh {llh:.4e} does not match reference " + f"{ref_llh:.4e}." + tolstr + ) + except KeyError: + logger.error( + "No reference likelihood found for " + f"{problem_id} in {references_yaml}" + ) + + for label, key in { + "simulation": "t_sim", + "adjoint sensitivity": "t_adj", + "forward sensitivity": "t_fwd", + }.items(): + try: + ref = reference_values[problem_id][key] + if times[key] > ref: + pytest.fail( + f"Computation time for {label} ({times[key]:.2e}) " + f"exceeds reference ({ref:.2e})." + ) + else: + logger.info( + f"Computation time for {label} ({times[key]:.2e}) " + f"within reference ({ref:.2e})." + ) + except KeyError: + logger.error( + f"No reference time for {label} found for " + f"{problem_id} in {references_yaml}" + ) + + @pytest.mark.filterwarnings( "ignore:divide by zero encountered in log", # https://github.com/AMICI-dev/AMICI/issues/18 @@ -152,9 +412,14 @@ class GradientCheckSettings: (SensitivityMethod.forward, SensitivityMethod.adjoint), ids=["forward", "adjoint"], ) -@pytest.mark.parametrize("model", models) -def test_benchmark_gradient(model, scale, sensitivity_method, request): - if not scale and model in ( +def test_benchmark_gradient( + benchmark_problem, scale, sensitivity_method, request +): + problem_id, petab_problem, amici_model = benchmark_problem + if problem_id not in problems_for_gradient_check: + pytest.skip("Excluded from gradient check.") + + if not scale and problem_id in ( "Smith_BMCSystBiol2013", "Brannmark_JBC2010", "Elowitz_Nature2000", @@ -165,10 +430,10 @@ def test_benchmark_gradient(model, scale, sensitivity_method, request): ): # not really worth the effort trying to fix these cases if they # only fail on linear scale - pytest.skip() + pytest.skip("scale=False disabled for this problem") if ( - model + problem_id in ( # events with parameter-dependent triggers # https://github.com/AMICI-dev/AMICI/issues/18 @@ -176,10 +441,9 @@ def test_benchmark_gradient(model, scale, sensitivity_method, request): ) and sensitivity_method == SensitivityMethod.adjoint ): - # FIXME - pytest.skip() + pytest.skip("Unsupported ASA+events") - petab_problem = benchmark_models_petab.get_problem(model) + petab_problem = benchmark_models_petab.get_problem(problem_id) if measurement_table_has_timepoint_specific_mappings( petab_problem.measurement_df, ): @@ -187,12 +451,12 @@ def test_benchmark_gradient(model, scale, sensitivity_method, request): # Only compute gradient for estimated parameters. parameter_ids = petab_problem.x_free_ids - cur_settings = settings[model] + cur_settings = settings[problem_id] # Setup AMICI objects. 
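+    # (uses the same model_output_dir as the benchmark_problem fixture, so
+    #  the already-compiled model is reused rather than rebuilt)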
amici_model = import_petab_problem( petab_problem, - model_output_dir=benchmark_outdir / model, + model_output_dir=benchmark_outdir / problem_id, ) amici_solver = amici_model.getSolver() amici_solver.setAbsoluteTolerance(cur_settings.atol_sim) diff --git a/tests/benchmark-models/test_petab_model.py b/tests/benchmark-models/test_petab_model.py deleted file mode 100755 index 125a046a5e..0000000000 --- a/tests/benchmark-models/test_petab_model.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python3 - -""" -Simulate a PEtab problem and compare results to reference values -""" - -import argparse -import contextlib -import importlib -import logging -import os -import sys -from pathlib import Path - -import amici -import numpy as np -import pandas as pd -import petab.v1 as petab -import yaml -from amici.logging import get_logger -from amici.petab.simulations import ( - LLH, - RDATAS, - rdatas_to_measurement_df, - simulate_petab, -) -from petab.v1.visualize import plot_problem -from petab.v1.lint import measurement_table_has_timepoint_specific_mappings - -logger = get_logger(f"amici.{__name__}", logging.WARNING) - - -def parse_cli_args(): - """Parse command line arguments - - Returns: - Parsed CLI arguments from ``argparse``. - """ - - parser = argparse.ArgumentParser( - description="Simulate PEtab-format model using AMICI." - ) - - # General options: - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - action="store_true", - help="More verbose output", - ) - parser.add_argument( - "-c", - "--check", - dest="check", - action="store_true", - help="Compare to reference value", - ) - parser.add_argument( - "-p", - "--plot", - dest="plot", - action="store_true", - help="Plot measurement and simulation results", - ) - - # PEtab problem - parser.add_argument( - "-y", - "--yaml", - dest="yaml_file_name", - required=True, - help="PEtab YAML problem filename", - ) - - # Corresponding AMICI model - parser.add_argument( - "-m", - "--model-name", - dest="model_name", - help="Name of the AMICI module of the model to " "simulate.", - required=True, - ) - parser.add_argument( - "-d", - "--model-dir", - dest="model_directory", - help="Directory containing the AMICI module of the " - "model to simulate. 
Required if model is not " - "in python path.", - ) - - parser.add_argument( - "-o", - "--simulation-file", - dest="simulation_file", - help="File to write simulation result to, in PEtab" - "measurement table format.", - ) - - return parser.parse_args() - - -def main(): - """Simulate the model specified on the command line""" - script_dir = Path(__file__).parent.absolute() - args = parse_cli_args() - loglevel = logging.DEBUG if args.verbose else logging.INFO - logger.setLevel(loglevel) - - logger.info( - f"Simulating '{args.model_name}' " - f"({args.model_directory}) using PEtab data from " - f"{args.yaml_file_name}" - ) - - # load PEtab files - problem = petab.Problem.from_yaml(args.yaml_file_name) - - if measurement_table_has_timepoint_specific_mappings( - problem.measurement_df - ): - petab.flatten_timepoint_specific_output_overrides(problem) - - # load model - if args.model_directory: - sys.path.insert(0, args.model_directory) - model_module = importlib.import_module(args.model_name) - amici_model = model_module.getModel() - amici_solver = amici_model.getSolver() - - amici_solver.setAbsoluteTolerance(1e-8) - amici_solver.setRelativeTolerance(1e-8) - amici_solver.setMaxSteps(int(1e4)) - if args.model_name in ("Brannmark_JBC2010", "Isensee_JCB2018"): - amici_model.setSteadyStateSensitivityMode( - amici.SteadyStateSensitivityMode.integrationOnly - ) - - times = dict() - - for label, sensi_mode in { - "t_sim": amici.SensitivityMethod.none, - "t_fwd": amici.SensitivityMethod.forward, - "t_adj": amici.SensitivityMethod.adjoint, - }.items(): - amici_solver.setSensitivityMethod(sensi_mode) - if sensi_mode == amici.SensitivityMethod.none: - amici_solver.setSensitivityOrder(amici.SensitivityOrder.none) - else: - amici_solver.setSensitivityOrder(amici.SensitivityOrder.first) - - res_repeats = [ - simulate_petab( - petab_problem=problem, - amici_model=amici_model, - solver=amici_solver, - log_level=loglevel, - ) - for _ in range(3) # repeat to get more stable timings - ] - res = res_repeats[0] - - times[label] = np.min( - [ - sum(r.cpu_time + r.cpu_timeB for r in res[RDATAS]) / 1000 - # only forwards/backwards simulation - for res in res_repeats - ] - ) - - if sensi_mode == amici.SensitivityMethod.none: - rdatas = res[RDATAS] - llh = res[LLH] - - times["np"] = sum(problem.parameter_df[petab.ESTIMATE]) - - pd.Series(times).to_csv(script_dir / f"{args.model_name}_benchmark.csv") - for rdata in rdatas: - assert ( - rdata.status == amici.AMICI_SUCCESS - ), f"Simulation failed for {rdata.id}" - - # create simulation PEtab table - sim_df = rdatas_to_measurement_df( - rdatas=rdatas, model=amici_model, measurement_df=problem.measurement_df - ) - sim_df.rename(columns={petab.MEASUREMENT: petab.SIMULATION}, inplace=True) - - if args.simulation_file: - sim_df.to_csv(args.simulation_file, index=False, sep="\t") - - if args.plot: - with contextlib.suppress(NotImplementedError): - # visualize fit - axs = plot_problem(petab_problem=problem, simulations_df=sim_df) - - # save figure - for plot_id, ax in axs.items(): - fig_path = os.path.join( - args.model_directory, - f"{args.model_name}_{plot_id}_vis.png", - ) - logger.info(f"Saving figure to {fig_path}") - ax.get_figure().savefig(fig_path, dpi=150) - - if args.check: - references_yaml = script_dir / "benchmark_models.yaml" - with open(references_yaml) as f: - refs = yaml.full_load(f) - - try: - ref_llh = refs[args.model_name]["llh"] - - rdiff = np.abs((llh - ref_llh) / ref_llh) - rtol = 1e-3 - adiff = np.abs(llh - ref_llh) - atol = 1e-3 - tolstr = ( - f" 
Absolute difference is {adiff:.2e} " - f"(tol {atol:.2e}) and relative difference is " - f"{rdiff:.2e} (tol {rtol:.2e})." - ) - - if np.isclose(llh, ref_llh, rtol=rtol, atol=atol): - logger.info( - f"Computed llh {llh:.4e} matches reference {ref_llh:.4e}." - + tolstr - ) - else: - logger.error( - f"Computed llh {llh:.4e} does not match reference " - f"{ref_llh:.4e}." + tolstr - ) - sys.exit(1) - except KeyError: - logger.error( - "No reference likelihood found for " - f"{args.model_name} in {references_yaml}" - ) - - for label, key in { - "simulation": "t_sim", - "adjoint sensitivity": "t_adj", - "forward sensitivity": "t_fwd", - }.items(): - try: - ref = refs[args.model_name][key] - if times[key] > ref: - logger.error( - f"Computation time for {label} ({times[key]:.2e}) " - f"exceeds reference ({ref:.2e})." - ) - sys.exit(1) - else: - logger.info( - f"Computation time for {label} ({times[key]:.2e}) " - f"within reference ({ref:.2e})." - ) - except KeyError: - logger.error( - f"No reference time for {label} found for " - f"{args.model_name} in {references_yaml}" - ) - - -if __name__ == "__main__": - main()
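
For reference, the new CI steps amount to the following local workflow (a
sketch, assuming a development install of AMICI's Python package):

    # gradient-check dependency and the benchmark problem collection
    python3 -m pip install git+https://github.com/ICB-DCM/fiddy.git
    git clone --depth 1 https://github.com/benchmarking-initiative/Benchmark-Models-PEtab.git
    python3 -m pip install -e Benchmark-Models-PEtab/src/python

    # run the tests with serial compilation, as in CI, then aggregate results
    cd tests/benchmark-models
    AMICI_PARALLEL_COMPILE="" pytest --durations=10
    python3 evaluate_benchmark.py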