From daf72c7bf3363caecf3b2f614f23136a36ef72a8 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 27 Aug 2023 01:37:04 +0800 Subject: [PATCH] feat(exporter): add Prometheus exporter (#92) --- .github/workflows/build.yaml | 42 +- .github/workflows/lint.yaml | 27 + .pre-commit-config.yaml | 4 + .pylintrc | 3 +- CHANGELOG.md | 1 + docs/source/spelling_wordlist.txt | 2 + nvitop-exporter/LICENSE | 202 +++++++ nvitop-exporter/MANIFEST.in | 1 + nvitop-exporter/README.md | 11 + nvitop-exporter/nvitop_exporter/__init__.py | 24 + nvitop-exporter/nvitop_exporter/__main__.py | 25 + nvitop-exporter/nvitop_exporter/cli.py | 240 ++++++++ nvitop-exporter/nvitop_exporter/exporter.py | 608 ++++++++++++++++++++ nvitop-exporter/nvitop_exporter/utils.py | 38 ++ nvitop-exporter/nvitop_exporter/version.py | 54 ++ nvitop-exporter/pyproject.toml | 83 +++ nvitop-exporter/requirements.txt | 2 + nvitop-exporter/setup.py | 44 ++ nvitop/api/__init__.py | 58 +- nvitop/api/device.py | 10 +- nvitop/api/utils.py | 18 +- nvitop/cli.py | 2 +- nvitop/version.py | 2 +- pyproject.toml | 11 +- 24 files changed, 1475 insertions(+), 37 deletions(-) create mode 100644 nvitop-exporter/LICENSE create mode 100644 nvitop-exporter/MANIFEST.in create mode 100644 nvitop-exporter/README.md create mode 100644 nvitop-exporter/nvitop_exporter/__init__.py create mode 100644 nvitop-exporter/nvitop_exporter/__main__.py create mode 100644 nvitop-exporter/nvitop_exporter/cli.py create mode 100644 nvitop-exporter/nvitop_exporter/exporter.py create mode 100644 nvitop-exporter/nvitop_exporter/utils.py create mode 100644 nvitop-exporter/nvitop_exporter/version.py create mode 100644 nvitop-exporter/pyproject.toml create mode 100644 nvitop-exporter/requirements.txt create mode 100755 nvitop-exporter/setup.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f1d16efd..1521ad8c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -72,15 +72,22 @@ jobs: python -m venv venv && 
( source venv/bin/activate && - python -m pip install --upgrade pip setuptools pre-commit pylint[spelling] mypy typing-extensions + python -m pip install --upgrade pip setuptools pre-commit pylint[spelling] mypy typing-extensions && python -m pip install -r requirements.txt && + python -m pip install -r nvitop-exporter/requirements.txt && python -m pre_commit install --install-hooks && python -m pre_commit run --all-files && python -c 'import nvitop' && python -m nvitop --version && python -m nvitop --help && python -m nvitop.select --version && - python -m nvitop.select --help + python -m nvitop.select --help && + ( + cd nvitop-exporter && + python -c 'import nvitop_exporter' && + python -m nvitop_exporter --version && + python -m nvitop_exporter --help + ) ) - name: Test docker build @@ -92,12 +99,17 @@ jobs: if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' run: | sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py + sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py - name: Print version - run: python setup.py --version + run: | + python setup.py --version + python nvitop-exporter/setup.py --version - name: Build sdist and wheels - run: python -m build + run: | + python -m build --outdir dist . 
+ python -m build --outdir dist nvitop-exporter - name: List built sdist and wheels run: ls -lh dist/ @@ -135,15 +147,23 @@ jobs: if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' run: | sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop/version.py + sed -i -E 's/^__release__\s*=.*$/__release__ = True/' nvitop-exporter/nvitop_exporter/version.py - name: Print version - run: python setup.py --version + run: | + python setup.py --version + python nvitop-exporter/setup.py --version - name: Check consistency between the package version and release tag if: startsWith(github.ref, 'refs/tags/') run: | - PACKAGE_VER="v$(python setup.py --version)" RELEASE_TAG="${GITHUB_REF#refs/*/}" + PACKAGE_VER="v$(python setup.py --version)" + if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then + echo "package ver. (${PACKAGE_VER}) != release tag. (${RELEASE_TAG})" + exit 1 + fi + PACKAGE_VER="v$(python nvitop-exporter/setup.py --version)" if [[ "${PACKAGE_VER}" != "${RELEASE_TAG}" ]]; then echo "package ver. (${PACKAGE_VER}) != release tag. 
(${RELEASE_TAG})" exit 1 @@ -163,10 +183,10 @@ jobs: with: user: __token__ password: ${{ secrets.TESTPYPI_UPLOAD_TOKEN }} - repository_url: https://test.pypi.org/legacy/ + repository-url: https://test.pypi.org/legacy/ verbose: true - print_hash: true - skip_existing: true + print-hash: true + skip-existing: true - name: Publish to PyPI if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' @@ -175,5 +195,5 @@ jobs: user: __token__ password: ${{ secrets.PYPI_UPLOAD_TOKEN }} verbose: true - print_hash: true - skip_existing: true + print-hash: true + skip-existing: true diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 8c5a8cd1..9af4d355 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -40,6 +40,10 @@ jobs: - name: Check syntax (Python 3.7) run: | "${{ steps.py37.outputs.python-path }}" -m compileall nvitop + ( + cd nvitop-exporter && + "${{ steps.py37.outputs.python-path }}" -m compileall nvitop_exporter + ) - name: Upgrade pip run: | @@ -67,6 +71,29 @@ jobs: "${{ steps.py37.outputs.python-path }}" -m nvitop.select --version "${{ steps.py37.outputs.python-path }}" -m nvitop.select --help + - name: Install dependencies for nvitop-exporter + run: | + python -m pip install -r nvitop-exporter/requirements.txt + + - name: Import tests for nvitop-exporter + run: | + ( + cd nvitop-exporter && + python -c 'import nvitop_exporter' && + python -m nvitop_exporter --version && + python -m nvitop_exporter --help + ) + + - name: Import tests for nvitop-exporter (Python 3.7) + run: | + ( + cd nvitop-exporter && + "${{ steps.py37.outputs.python-path }}" -m pip install -r requirements.txt && + "${{ steps.py37.outputs.python-path }}" -c 'import nvitop_exporter' && + "${{ steps.py37.outputs.python-path }}" -m nvitop_exporter --version && + "${{ steps.py37.outputs.python-path }}" -m nvitop_exporter --help + ) + - name: Install linters run: | python -m pip install --upgrade pre-commit pylint[spelling] 
mypy typing-extensions diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 987b8912..dcc823b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -88,3 +88,7 @@ repos: language: system types_or: [python, pyi] require_serial: true + exclude: | + (?x)( + ^nvitop-exporter/setup.py$ + ) diff --git a/.pylintrc b/.pylintrc index ee0583f4..8a0628a0 100644 --- a/.pylintrc +++ b/.pylintrc @@ -421,7 +421,8 @@ confidence=HIGH, # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". disable=consider-using-f-string, - duplicate-code + duplicate-code, + wrong-import-order # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/CHANGELOG.md b/CHANGELOG.md index eb4a8afc..39e5027f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Add Prometheus exporter by [@XuehaiPan](https://github.com/XuehaiPan) in [#92](https://github.com/XuehaiPan/nvitop/pull/92). - Add device APIs to query PCIe and NVLink throughput by [@XuehaiPan](https://github.com/XuehaiPan) in [#87](https://github.com/XuehaiPan/nvitop/pull/87). ### Changed diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 106df4f1..e87a36d4 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -151,3 +151,5 @@ tx rx ThroughputInfo pytorch +api +utils diff --git a/nvitop-exporter/LICENSE b/nvitop-exporter/LICENSE new file mode 100644 index 00000000..1fcc34a3 --- /dev/null +++ b/nvitop-exporter/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021-2023 Xuehai Pan. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/nvitop-exporter/MANIFEST.in b/nvitop-exporter/MANIFEST.in new file mode 100644 index 00000000..1aba38f6 --- /dev/null +++ b/nvitop-exporter/MANIFEST.in @@ -0,0 +1 @@ +include LICENSE diff --git a/nvitop-exporter/README.md b/nvitop-exporter/README.md new file mode 100644 index 00000000..3599f625 --- /dev/null +++ b/nvitop-exporter/README.md @@ -0,0 +1,11 @@ +# nvitop-exporter + +Prometheus exporter built on top of `nvitop`. 
+ +## Installation + +Install from PyPI: + +```bash +pip3 install --upgrade nvitop-exporter +``` diff --git a/nvitop-exporter/nvitop_exporter/__init__.py b/nvitop-exporter/nvitop_exporter/__init__.py new file mode 100644 index 00000000..67ddb819 --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/__init__.py @@ -0,0 +1,24 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Prometheus exporter built on top of ``nvitop``.""" + +from nvitop_exporter.exporter import PrometheusExporter +from nvitop_exporter.utils import get_ip_address +from nvitop_exporter.version import __version__ + + +__all__ = ['PrometheusExporter', 'get_ip_address'] diff --git a/nvitop-exporter/nvitop_exporter/__main__.py b/nvitop-exporter/nvitop_exporter/__main__.py new file mode 100644 index 00000000..9c76a7f5 --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/__main__.py @@ -0,0 +1,25 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Prometheus exporter built on top of ``nvitop``.""" + +import sys + +from nvitop_exporter.cli import main + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/nvitop-exporter/nvitop_exporter/cli.py b/nvitop-exporter/nvitop_exporter/cli.py new file mode 100644 index 00000000..f493c8e1 --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/cli.py @@ -0,0 +1,240 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Prometheus exporter built on top of ``nvitop``.""" + +from __future__ import annotations + +import argparse +import sys +from typing import TextIO + +from prometheus_client import start_wsgi_server + +import nvitop +from nvitop import Device, colored, libnvml +from nvitop_exporter.exporter import PrometheusExporter +from nvitop_exporter.utils import get_ip_address +from nvitop_exporter.version import __version__ + + +def cprint(text: str = '', *, file: TextIO | None = None) -> None: + """Print colored text to a file.""" + for prefix, color in ( + ('INFO: ', 'yellow'), + ('WARNING: ', 'yellow'), + ('ERROR: ', 'red'), + ('NVML ERROR: ', 'red'), + ): + if text.startswith(prefix): + text = text.replace( + prefix.rstrip(), + colored(prefix.rstrip(), color=color, attrs=('bold',)), + 1, + ) + print(text, file=file) + + +def parse_arguments() -> argparse.Namespace: + """Parse command-line arguments for ``nvitop-exporter``.""" + + def posfloat(argstring: str) -> float: + num = float(argstring) + if num <= 0: + raise ValueError + return num + + posfloat.__name__ = 'positive float' + + parser = argparse.ArgumentParser( + prog='nvitop-exporter', + description='Prometheus exporter built on top of `nvitop`.', + formatter_class=argparse.RawTextHelpFormatter, + add_help=False, + ) + parser.add_argument( + '--help', + '-h', + dest='help', + action='help', + default=argparse.SUPPRESS, + help='Show this help message and exit.', + ) + parser.add_argument( + '--version', + '-V', + dest='version', + action='version', + version=f'%(prog)s {__version__} (nvitop {nvitop.__version__})', + help="Show %(prog)s's version number and exit.", + ) + + parser.add_argument( + '--hostname', + '--host', + '-H', + dest='hostname', + type=str, + default=get_ip_address(), + metavar='HOSTNAME', + help='Hostname to display in the exporter. 
(default: %(default)s)', + ) + parser.add_argument( + '--bind-address', + '--bind', + '-B', + dest='bind_address', + type=str, + default='127.0.0.1', + metavar='ADDRESS', + help='Local address to bind to. (default: %(default)s)', + ) + parser.add_argument( + '--port', + '-p', + type=int, + default=8000, + help='Port to listen on. (default: %(default)d)', + ) + parser.add_argument( + '--interval', + dest='interval', + type=posfloat, + default=1.0, + metavar='SEC', + help='Interval between updates in seconds. (default: %(default)s)', + ) + + args = parser.parse_args() + if args.interval < 0.25: + parser.error( + f'the interval {args.interval:0.2g}s is too short, which may cause performance issues. ' + f'Expected 1/4 or higher.', + ) + + return args + + +def main() -> int: # pylint: disable=too-many-locals,too-many-statements + """Main function for ``nvitop-exporter`` CLI.""" + args = parse_arguments() + + try: + device_count = Device.count() + except libnvml.NVMLError_LibraryNotFound: + return 1 + except libnvml.NVMLError as ex: + cprint(f'NVML ERROR: {ex}', file=sys.stderr) + return 1 + + if device_count == 0: + cprint('NVML ERROR: No NVIDIA devices found.', file=sys.stderr) + return 1 + + physical_devices = Device.from_indices(range(device_count)) + mig_devices = [] + for device in physical_devices: + mig_devices.extend(device.mig_devices()) + cprint( + 'INFO: Found {}{}.'.format( + colored(str(device_count), color='green', attrs=('bold',)), + ( + ' physical device(s) and {} MIG device(s)'.format( + colored(str(len(mig_devices)), color='blue', attrs=('bold',)), + ) + if mig_devices + else ' device(s)' + ), + ), + file=sys.stderr, + ) + + devices = sorted( + physical_devices + mig_devices, # type: ignore[operator] + key=lambda d: (d.index,) if isinstance(d.index, int) else d.index, + ) + for device in devices: + name = device.name() + uuid = device.uuid() + if device.is_mig_device(): + name = name.rpartition(' ')[-1] + cprint( + f'INFO: MIG {name:<11} Device 
{device.mig_index:>2d}: (UUID: {uuid})', + file=sys.stderr, + ) + else: + cprint(f'INFO: GPU {device.index}: {name} (UUID: {uuid})', file=sys.stderr) + + exporter = PrometheusExporter(devices, hostname=args.hostname, interval=args.interval) + + try: + start_wsgi_server(port=args.port, addr=args.bind_address) + except OSError as ex: + if 'address already in use' in str(ex).lower(): + cprint( + ( + 'ERROR: Address {} is already in use. ' + 'Please specify a different port via `--port <PORT>`.' + ).format( + colored( + f'http://{args.bind_address}:{args.port}', + color='blue', + attrs=('bold', 'underline'), + ), + ), + file=sys.stderr, + ) + elif 'cannot assign requested address' in str(ex).lower(): + cprint( + ( + 'ERROR: Cannot assign requested address at {}. ' + 'Please specify a different address via `--bind-address <ADDRESS>
`.' + ).format( + colored( + f'http://{args.bind_address}:{args.port}', + color='blue', + attrs=('bold', 'underline'), + ), + ), + file=sys.stderr, + ) + else: + cprint(f'ERROR: {ex}', file=sys.stderr) + return 1 + + cprint( + 'INFO: Start the exporter on {} at {}.'.format( + colored(args.hostname, color='magenta', attrs=('bold',)), + colored( + f'http://{args.bind_address}:{args.port}/metrics', + color='green', + attrs=('bold', 'underline'), + ), + ), + file=sys.stderr, + ) + + try: + exporter.collect() + except KeyboardInterrupt: + cprint(file=sys.stderr) + cprint('INFO: Interrupted by user.', file=sys.stderr) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/nvitop-exporter/nvitop_exporter/exporter.py b/nvitop-exporter/nvitop_exporter/exporter.py new file mode 100644 index 00000000..10e56785 --- /dev/null +++ b/nvitop-exporter/nvitop_exporter/exporter.py @@ -0,0 +1,608 @@ +# This file is part of nvitop, the interactive NVIDIA-GPU process viewer. +# +# Copyright 2021-2023 Xuehai Pan. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class PrometheusExporter:  # pylint: disable=too-many-instance-attributes
    """Prometheus exporter built on top of ``nvitop``.

    The exporter owns one gauge per host, GPU and GPU-process metric. All gauges are
    registered in ``registry`` at construction time and refreshed by :meth:`collect`
    (or by calling :meth:`update_host` / :meth:`update_device` directly).
    """

    def __init__(  # pylint: disable=too-many-statements
        self,
        devices: Sequence[Device],
        hostname: str | None = None,
        *,
        registry: CollectorRegistry = REGISTRY,
        interval: float = 1.0,
    ) -> None:
        """Initialize the Prometheus exporter.

        Args:
            devices: A list or tuple of ``PhysicalDevice`` / ``MigDevice`` instances to export.
            hostname: Value of the ``hostname`` label. Falls back to ``get_ip_address()``
                when omitted or empty.
            registry: The collector registry the metrics are registered in.
            interval: The target period (in seconds) of one iteration of :meth:`collect`.

        Raises:
            TypeError: If ``devices`` is not a list or tuple, or if one of its items is
                neither a ``PhysicalDevice`` nor a ``MigDevice``.
        """
        if not isinstance(devices, (list, tuple)):
            raise TypeError(f'Expected a list or tuple of devices, got {type(devices)}')
        devices = list(devices)

        for device in devices:
            if not isinstance(device, (PhysicalDevice, MigDevice)):
                raise TypeError(f'Expected a PhysicalDevice or MigDevice, got {type(device)}')

        self.devices = devices
        self.hostname = hostname or get_ip_address()
        self.registry = registry
        self.interval = interval

        # The ``(pid, username)`` label pairs exported during the previous update of each
        # device, keyed by the device ``index`` label. Used to drop the series of processes
        # that are gone.
        self._alive_pids: dict[str, set[tuple[int, str]]] = {}

        self.info = Info(
            'nvitop',
            documentation='NVITOP.',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.info.labels(hostname=self.hostname).info(
            {
                'device_count': str(Device.count()),
                'driver_version': Device.driver_version(),
                'cuda_driver_version': Device.cuda_driver_version(),
            },
        )

        # Create gauges for host metrics
        self.host_uptime = Gauge(
            name='host_uptime',
            documentation='Host uptime (s).',
            unit='Second',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_cpu_percent = Gauge(
            name='host_cpu_percent',
            documentation='Host CPU percent (%).',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_total = Gauge(
            name='host_virtual_memory_total',
            documentation='Host virtual memory total (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_used = Gauge(
            name='host_virtual_memory_used',
            documentation='Host virtual memory used (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_free = Gauge(
            name='host_virtual_memory_free',
            documentation='Host virtual memory free (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_virtual_memory_percent = Gauge(
            name='host_virtual_memory_percent',
            documentation='Host virtual memory percent (%).',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_total = Gauge(
            name='host_swap_memory_total',
            documentation='Host swap total (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_used = Gauge(
            name='host_swap_memory_used',
            documentation='Host swap used (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_free = Gauge(
            name='host_swap_memory_free',
            documentation='Host swap free (MiB).',
            unit='MiB',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_swap_memory_percent = Gauge(
            name='host_swap_memory_percent',
            documentation='Host swap percent (%).',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_load_average_1m = Gauge(
            name='host_load_average_1m',
            documentation='Host load average for the last minute.',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_load_average_5m = Gauge(
            name='host_load_average_5m',
            documentation='Host load average for the last 5 minutes.',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_load_average_15m = Gauge(
            name='host_load_average_15m',
            documentation='Host load average for the last 15 minutes.',
            unit='Percentage',
            labelnames=['hostname'],
            registry=self.registry,
        )
        self.host_net_io_tx_data = Gauge(
            name='host_net_io_tx_data',
            documentation='Host network I/O transmitted data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_net_io_rx_data = Gauge(
            name='host_net_io_rx_data',
            documentation='Host network I/O received data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_net_io_tx_packets = Gauge(
            name='host_net_io_tx_packets',
            documentation='Host network I/O transmitted packets.',
            unit='Packet',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_net_io_rx_packets = Gauge(
            name='host_net_io_rx_packets',
            documentation='Host network I/O received packets.',
            unit='Packet',
            labelnames=['hostname', 'interface'],
            registry=self.registry,
        )
        self.host_disk_io_read_data = Gauge(
            name='host_disk_io_read_data',
            documentation='Host disk I/O read data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'partition'],
            registry=self.registry,
        )
        self.host_disk_io_write_data = Gauge(
            name='host_disk_io_write_data',
            documentation='Host disk I/O write data (MiB).',
            unit='MiB',
            labelnames=['hostname', 'partition'],
            registry=self.registry,
        )
        self.host_disk_usage_total = Gauge(
            name='host_disk_usage_total',
            documentation='Host disk usage total (MiB).',
            unit='MiB',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )
        self.host_disk_usage_used = Gauge(
            name='host_disk_usage_used',
            documentation='Host disk usage used (MiB).',
            unit='MiB',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )
        self.host_disk_usage_free = Gauge(
            name='host_disk_usage_free',
            documentation='Host disk usage free (MiB).',
            unit='MiB',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )
        self.host_disk_usage_percent = Gauge(
            name='host_disk_usage_percent',
            documentation='Host disk usage percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'mountpoint'],
            registry=self.registry,
        )

        # Create gauges for GPU metrics
        self.gpu_utilization = Gauge(
            name='gpu_utilization',
            documentation='GPU utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_utilization = Gauge(
            name='gpu_memory_utilization',
            documentation='GPU memory utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_encoder_utilization = Gauge(
            name='gpu_encoder_utilization',
            documentation='GPU encoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_decoder_utilization = Gauge(
            name='gpu_decoder_utilization',
            documentation='GPU decoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_total = Gauge(
            name='gpu_memory_total',
            documentation='GPU memory total (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_used = Gauge(
            name='gpu_memory_used',
            documentation='GPU memory used (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_free = Gauge(
            name='gpu_memory_free',
            documentation='GPU memory free (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_memory_percent = Gauge(
            name='gpu_memory_percent',
            documentation='GPU memory percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_sm = Gauge(
            name='gpu_clock_sm',
            documentation='GPU SM clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_memory = Gauge(
            name='gpu_clock_memory',
            documentation='GPU memory clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_graphics = Gauge(
            name='gpu_clock_graphics',
            documentation='GPU graphics clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_clock_video = Gauge(
            name='gpu_clock_video',
            documentation='GPU video clock (MHz).',
            unit='MHz',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_power_usage = Gauge(
            name='gpu_power_usage',
            documentation='GPU power usage (W).',
            unit='W',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_power_limit = Gauge(
            name='gpu_power_limit',
            documentation='GPU power limit (W).',
            unit='W',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_temperature = Gauge(
            name='gpu_temperature',
            documentation='GPU temperature (C).',
            unit='C',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_fan_speed = Gauge(
            name='gpu_fan_speed',
            documentation='GPU fan speed (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_pcie_tx_throughput = Gauge(
            name='gpu_pcie_tx_throughput',
            documentation='GPU PCIe transmit throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_pcie_rx_throughput = Gauge(
            name='gpu_pcie_rx_throughput',
            documentation='GPU PCIe receive throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_nvlink_mean_tx_throughput = Gauge(
            name='gpu_nvlink_mean_tx_throughput',
            documentation='GPU mean NVLink transmit throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_nvlink_mean_rx_throughput = Gauge(
            name='gpu_nvlink_mean_rx_throughput',
            documentation='GPU mean NVLink receive throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid'],
            registry=self.registry,
        )
        self.gpu_nvlink_tx_throughput = Gauge(
            name='gpu_nvlink_tx_throughput',
            documentation='GPU NVLink transmit throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'],
            registry=self.registry,
        )
        self.gpu_nvlink_rx_throughput = Gauge(
            name='gpu_nvlink_rx_throughput',
            documentation='GPU NVLink receive throughput (MiB/s).',
            unit='MiBps',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'link'],
            registry=self.registry,
        )

        # Create gauges for process metrics
        self.process_running_time = Gauge(
            name='process_running_time',
            documentation='Process running time (s).',
            unit='Second',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_cpu_percent = Gauge(
            name='process_cpu_percent',
            documentation='Process CPU percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_rss_memory = Gauge(
            name='process_rss_memory',
            documentation='Process memory resident set size (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_memory_percent = Gauge(
            name='process_memory_percent',
            documentation='Process memory percent (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_memory = Gauge(
            name='process_gpu_memory',
            documentation='Process GPU memory (MiB).',
            unit='MiB',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_sm_utilization = Gauge(
            name='process_gpu_sm_utilization',
            documentation='Process GPU SM utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_memory_utilization = Gauge(
            name='process_gpu_memory_utilization',
            documentation='Process GPU memory utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_encoder_utilization = Gauge(
            name='process_gpu_encoder_utilization',
            documentation='Process GPU encoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )
        self.process_gpu_decoder_utilization = Gauge(
            name='process_gpu_decoder_utilization',
            documentation='Process GPU decoder utilization (%).',
            unit='Percentage',
            labelnames=['hostname', 'index', 'devicename', 'uuid', 'pid', 'username'],
            registry=self.registry,
        )

        # Every gauge that carries the per-process ``pid`` / ``username`` labels. Kept
        # together so the series of exited processes can be removed from all of them.
        self._process_gauges = (
            self.process_running_time,
            self.process_cpu_percent,
            self.process_rss_memory,
            self.process_memory_percent,
            self.process_gpu_memory,
            self.process_gpu_sm_utilization,
            self.process_gpu_memory_utilization,
            self.process_gpu_encoder_utilization,
            self.process_gpu_decoder_utilization,
        )

    def collect(self) -> None:
        """Collect metrics forever, refreshing every gauge once per ``interval`` seconds.

        This method never returns. Each iteration updates the host metrics and then the
        metrics of every device, and sleeps for whatever is left of the interval.
        """
        while True:
            next_update_time = time.monotonic() + self.interval
            self.update_host()
            for device in self.devices:
                self.update_device(device)
            # An update that took longer than the interval is followed by the next one
            # immediately (the sleep duration is clamped at zero).
            time.sleep(max(0.0, next_update_time - time.monotonic()))

    def update_host(self) -> None:
        """Update metrics for the host.

        Covers uptime, CPU, virtual / swap memory, load average, per-interface network
        I/O, per-partition disk I/O and per-mountpoint disk usage.
        """
        load_average = host.load_average()
        if load_average is None:
            load_average = (0.0, 0.0, 0.0)  # type: ignore[unreachable]
        virtual_memory = host.virtual_memory()
        swap_memory = host.swap_memory()
        net_io_counters = host.net_io_counters(pernic=True)  # type: ignore[attr-defined]
        disk_io_counters = host.disk_io_counters(perdisk=True)  # type: ignore[attr-defined]

        for gauge, value in (
            (self.host_uptime, host.uptime()),
            (self.host_cpu_percent, host.cpu_percent()),
            (self.host_virtual_memory_total, virtual_memory.total / MiB),
            (self.host_virtual_memory_used, virtual_memory.used / MiB),
            (self.host_virtual_memory_free, virtual_memory.free / MiB),
            (self.host_virtual_memory_percent, virtual_memory.percent),
            (self.host_swap_memory_total, swap_memory.total / MiB),
            (self.host_swap_memory_used, swap_memory.used / MiB),
            (self.host_swap_memory_free, swap_memory.free / MiB),
            (self.host_swap_memory_percent, swap_memory.percent),
            (self.host_load_average_1m, load_average[0]),
            (self.host_load_average_5m, load_average[1]),
            (self.host_load_average_15m, load_average[2]),
        ):
            gauge.labels(self.hostname).set(value)

        for interface, net_io_counter in net_io_counters.items():
            for gauge, value in (
                (self.host_net_io_tx_data, net_io_counter.bytes_sent / MiB),
                (self.host_net_io_rx_data, net_io_counter.bytes_recv / MiB),
                (self.host_net_io_tx_packets, net_io_counter.packets_sent),
                (self.host_net_io_rx_packets, net_io_counter.packets_recv),
            ):
                gauge.labels(hostname=self.hostname, interface=interface).set(value)

        for partition, disk_io_counter in disk_io_counters.items():
            for gauge, value in (
                (self.host_disk_io_read_data, disk_io_counter.read_bytes / MiB),
                (self.host_disk_io_write_data, disk_io_counter.write_bytes / MiB),
            ):
                gauge.labels(hostname=self.hostname, partition=partition).set(value)
        for partition in host.disk_partitions():  # type: ignore[attr-defined]
            try:
                partition_usage = host.disk_usage(partition.mountpoint)  # type: ignore[attr-defined]
            except (OSError, host.PsutilError):
                # Skip mountpoints whose usage cannot be queried.
                continue
            for gauge, value in (
                (self.host_disk_usage_total, partition_usage.total / MiB),
                (self.host_disk_usage_used, partition_usage.used / MiB),
                (self.host_disk_usage_free, partition_usage.free / MiB),
                (self.host_disk_usage_percent, partition_usage.percent),
            ):
                gauge.labels(hostname=self.hostname, mountpoint=partition.mountpoint).set(value)

    def update_device(self, device: Device) -> None:
        """Update metrics for a single device and for the processes running on it.

        Per-process series that were exported by the previous update of this device but
        whose ``(pid, username)`` pair is no longer reported are removed from the process
        gauges, so exited processes do not linger in the scrape output.
        """
        # A tuple index is rendered as ``'<i>:<j>'``.
        index = (
            str(device.index) if isinstance(device.index, int) else ':'.join(map(str, device.index))
        )
        name = device.name()
        uuid = device.uuid()

        with device.oneshot():
            clock_infos = device.clock_infos()  # query once, reuse for all four clocks
            for gauge, value in (
                (self.gpu_utilization, float(device.gpu_utilization())),
                (self.gpu_memory_utilization, float(device.memory_utilization())),
                (self.gpu_encoder_utilization, float(device.encoder_utilization())),
                (self.gpu_decoder_utilization, float(device.decoder_utilization())),
                (self.gpu_memory_total, device.memory_total() / MiB),
                (self.gpu_memory_used, device.memory_used() / MiB),
                (self.gpu_memory_free, device.memory_free() / MiB),
                (self.gpu_memory_percent, float(device.memory_percent())),
                (self.gpu_clock_sm, float(clock_infos.sm)),
                (self.gpu_clock_memory, float(clock_infos.memory)),
                (self.gpu_clock_graphics, float(clock_infos.graphics)),
                (self.gpu_clock_video, float(clock_infos.video)),
                (self.gpu_power_usage, device.power_usage() / 1000.0),
                (self.gpu_power_limit, device.power_limit() / 1000.0),
                (self.gpu_temperature, float(device.temperature())),
                (self.gpu_fan_speed, float(device.fan_speed())),
                (self.gpu_pcie_tx_throughput, device.pcie_tx_throughput() / 1024.0),
                (self.gpu_pcie_rx_throughput, device.pcie_rx_throughput() / 1024.0),
                (self.gpu_nvlink_mean_tx_throughput, device.nvlink_mean_tx_throughput() / 1024.0),
                (self.gpu_nvlink_mean_rx_throughput, device.nvlink_mean_rx_throughput() / 1024.0),
            ):
                gauge.labels(
                    hostname=self.hostname,
                    index=index,
                    devicename=name,
                    uuid=uuid,
                ).set(value)

            for gauge, nvlink_throughput in (
                (self.gpu_nvlink_tx_throughput, device.nvlink_tx_throughput()),
                (self.gpu_nvlink_rx_throughput, device.nvlink_rx_throughput()),
            ):
                for link, throughput in enumerate(nvlink_throughput):
                    gauge.labels(
                        hostname=self.hostname,
                        index=index,
                        devicename=name,
                        uuid=uuid,
                        link=link,
                    ).set(throughput / 1024.0)

        previous_alive_pids = self._alive_pids.get(index, set())
        alive_pids: set[tuple[int, str]] = set()
        self._alive_pids[index] = alive_pids

        with GpuProcess.failsafe():
            for pid, process in device.processes().items():
                with process.oneshot():
                    username = process.username()
                    running_time = process.running_time()
                    alive_pids.add((pid, username))
                    for gauge, value in (
                        (
                            self.process_running_time,
                            running_time.total_seconds() if running_time else math.nan,
                        ),
                        (self.process_cpu_percent, process.cpu_percent()),
                        (self.process_rss_memory, process.host_memory() / MiB),
                        (self.process_memory_percent, float(process.memory_percent())),
                        (self.process_gpu_memory, process.gpu_memory() / MiB),
                        (
                            self.process_gpu_sm_utilization,
                            float(process.gpu_sm_utilization()),
                        ),
                        (
                            self.process_gpu_memory_utilization,
                            float(process.gpu_memory_utilization()),
                        ),
                        (
                            self.process_gpu_encoder_utilization,
                            float(process.gpu_encoder_utilization()),
                        ),
                        (
                            self.process_gpu_decoder_utilization,
                            float(process.gpu_decoder_utilization()),
                        ),
                    ):
                        gauge.labels(
                            hostname=self.hostname,
                            index=index,
                            devicename=name,
                            uuid=uuid,
                            pid=pid,
                            username=username,
                        ).set(value)

        # Drop the series of processes that disappeared since the previous update;
        # otherwise they would be exported with their last values forever.
        for stale_pid, stale_username in previous_alive_pids - alive_pids:
            for gauge in self._process_gauges:
                try:
                    gauge.remove(self.hostname, index, name, uuid, stale_pid, stale_username)
                except KeyError:
                    # The series was never created or is already gone; nothing to remove.
                    pass
# Reference: https://stackoverflow.com/a/28950776
def get_ip_address() -> str:
    """Get the IP address of the current machine.

    A UDP socket is "connected" to an arbitrary address and the local address chosen
    for that socket is read back.

    Returns:
        The local IPv4 address as a dotted-quad string, or ``'127.0.0.1'`` if it cannot
        be determined for any reason (including failure to create the socket).
    """
    try:
        # Create the socket inside the guarded region so that a failure to create it
        # also falls back to the loopback address; ``with`` closes it on every path.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.settimeout(0.0)
            # Doesn't even have to be reachable
            sock.connect(('10.254.254.254', 1))
            ip_address: str = sock.getsockname()[0]
    except Exception:  # noqa: BLE001 # pylint: disable=broad-except
        # Deliberately best-effort: the caller only needs a usable host label.
        ip_address = '127.0.0.1'
    return ip_address
"""Prometheus exporter built on top of ``nvitop``."""

__version__ = '1.3.0'
__license__ = 'Apache-2.0'
__author__ = __maintainer__ = 'Xuehai Pan'
__email__ = 'XuehaiPan@pku.edu.cn'
__release__ = False

if not __release__:
    # Development checkout: derive a version from ``git describe`` when possible.
    import os
    import subprocess

    def _describe_version(fallback: str) -> str:
        """Return a development version derived from ``git describe``, or ``fallback``.

        ``v1.2.0-3-gabc1234`` becomes ``1.2.1.dev3+gabc1234`` (the last numeric component
        is bumped), and an exact tag ``v1.2.0`` becomes ``1.2.0``. ``fallback`` is returned
        when ``git`` cannot be run, the command fails, or the last component of the tag is
        not an integer.
        """
        try:
            described = subprocess.check_output(
                ['git', 'describe', '--abbrev=7'],  # noqa: S603,S607
                cwd=os.path.dirname(os.path.abspath(__file__)),
                stderr=subprocess.DEVNULL,
                text=True,
            )
        except (OSError, subprocess.CalledProcessError):
            return fallback

        prefix, sep, suffix = (
            described.strip()
            .lstrip('v')
            .replace('-', '.dev', 1)
            .replace('-', '+', 1)
            .partition('.dev')
        )
        if not sep:
            # Exactly on a tag: use the tag itself.
            return prefix or fallback

        version_prefix, dot, version_tail = prefix.rpartition('.')
        try:
            bumped = int(version_tail) + 1
        except ValueError:
            # E.g. a pre-release tag such as ``1.3.0rc1``; keep the static version
            # instead of failing the import of this module.
            return fallback
        return f'{version_prefix}{dot}{bumped}{sep}{suffix}'

    __version__ = _describe_version(__version__)

    # Keep the module namespace limited to the metadata names.
    del os, subprocess, _describe_version
+readme = "README.md" +requires-python = ">= 3.7" +authors = [{ name = "Xuehai Pan", email = "XuehaiPan@pku.edu.cn" }] +license = { text = "Apache License, Version 2.0 (Apache-2.0)" } +keywords = [ + "nvidia", + "nvidia-smi", + "NVIDIA", + "NVML", + "CUDA", + "GPU", + "top", + "monitoring", + "prometheus", + "Prometheus", + "grafana", + "Grafana", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Environment :: GPU", + "Environment :: GPU :: NVIDIA CUDA", + "Intended Audience :: Developers", + "Intended Audience :: End Users/Desktop", + "Intended Audience :: System Administrators", + "Topic :: System :: Hardware", + "Topic :: System :: Monitoring", + "Topic :: System :: Systems Administration", + "Topic :: Utilities", +] +dependencies = [ + # Sync with nvitop/version.py and requirements.txt + "nvitop == 1.3.0", + "prometheus-client >= 0.4.0", +] +dynamic = ["version"] + +[project.scripts] +nvitop-exporter = "nvitop_exporter.cli:main" + +[project.urls] +Homepage = "https://github.com/XuehaiPan/nvitop" +Repository = "https://github.com/XuehaiPan/nvitop" +Documentation = "https://nvitop.readthedocs.io" +"Bug Report" = "https://github.com/XuehaiPan/nvitop/issues" + +[tool.setuptools.packages.find] +include = ["nvitop_exporter", "nvitop_exporter.*"] + +[tool.black] +safe = true +line-length = 100 +skip-string-normalization = true +target-version = ["py37", "py38", "py39", "py310", "py311"] + +[tool.isort] +atomic = true +profile = "black" +src_paths = ["nvitop_exporter"] +known_first_party = ["nvitop", "nvitop_exporter"] +indent = 4 +line_length = 100 
+lines_after_imports = 2 +multi_line_output = 3 + +[tool.ruff] +extend = "../pyproject.toml" diff --git a/nvitop-exporter/requirements.txt b/nvitop-exporter/requirements.txt new file mode 100644 index 00000000..2b792ceb --- /dev/null +++ b/nvitop-exporter/requirements.txt @@ -0,0 +1,2 @@ +nvitop +prometheus-client >= 0.4.0 diff --git a/nvitop-exporter/setup.py b/nvitop-exporter/setup.py new file mode 100755 index 00000000..9104bc78 --- /dev/null +++ b/nvitop-exporter/setup.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +"""Setup script for ``nvitop-exporter``.""" + +import pathlib +import re +import sys + +from setuptools import setup + + +HERE = pathlib.Path(__file__).absolute().parent +VERSION_FILE = HERE / 'nvitop_exporter' / 'version.py' + +sys.path.insert(0, str(VERSION_FILE.parent)) +# pylint: disable-next=import-error,wrong-import-position +import version # noqa + + +VERSION_CONTENT = None + +try: + if not version.__release__: + try: + VERSION_CONTENT = VERSION_FILE.read_text(encoding='utf-8') + VERSION_FILE.write_text( + data=re.sub( + r"""__version__\s*=\s*('[^']+'|"[^"]+")""", + f'__version__ = {version.__version__!r}', + string=VERSION_CONTENT, + ), + encoding='utf-8', + ) + except OSError: + VERSION_CONTENT = None + + setup( + name='nvitop-exporter', + version=version.__version__, + ) +finally: + if VERSION_CONTENT is not None: + with VERSION_FILE.open(mode='wt', encoding='utf-8', newline='') as file: + file.write(VERSION_CONTENT) diff --git a/nvitop/api/__init__.py b/nvitop/api/__init__.py index 25227c29..fd4e814a 100644 --- a/nvitop/api/__init__.py +++ b/nvitop/api/__init__.py @@ -29,18 +29,37 @@ ) from nvitop.api.libnvml import NVMLError, nvmlCheckReturn from nvitop.api.process import GpuProcess, HostProcess, command_join -from nvitop.api.utils import * # noqa: F403 +from nvitop.api.utils import ( # explicitly export these to appease mypy + NA, + SIZE_UNITS, + UINT_MAX, + ULONGLONG_MAX, + GiB, + KiB, + MiB, + NaType, + NotApplicable, + 
NotApplicableType, + PiB, + Snapshot, + TiB, + boolify, + bytes2human, + colored, + human2bytes, + set_color, + timedelta2human, + utilization2string, +) __all__ = [ - 'take_snapshots', - 'collect_in_background', - 'ResourceMetricCollector', - 'libnvml', - 'nvmlCheckReturn', 'NVMLError', + 'nvmlCheckReturn', + 'libnvml', 'libcuda', 'libcudart', + # nvitop.api.device 'Device', 'PhysicalDevice', 'MigDevice', @@ -48,9 +67,34 @@ 'CudaMigDevice', 'parse_cuda_visible_devices', 'normalize_cuda_visible_devices', + # nvitop.api.process 'host', 'HostProcess', 'GpuProcess', 'command_join', - *utils.__all__, + # nvitop.api.collector + 'take_snapshots', + 'collect_in_background', + 'ResourceMetricCollector', + # nvitop.api.utils + 'NA', + 'NaType', + 'NotApplicable', + 'NotApplicableType', + 'UINT_MAX', + 'ULONGLONG_MAX', + 'KiB', + 'MiB', + 'GiB', + 'TiB', + 'PiB', + 'SIZE_UNITS', + 'bytes2human', + 'human2bytes', + 'timedelta2human', + 'utilization2string', + 'colored', + 'set_color', + 'boolify', + 'Snapshot', ] diff --git a/nvitop/api/device.py b/nvitop/api/device.py index 497d8b5d..2cf50c4b 100644 --- a/nvitop/api/device.py +++ b/nvitop/api/device.py @@ -1154,7 +1154,7 @@ def gpu_utilization(self) -> int | NaType: # in percentage gpu_percent = gpu_utilization # in percentage - def memory_utilization(self) -> float | NaType: # in percentage + def memory_utilization(self) -> int | NaType: # in percentage """Percent of time over the past sample period during which global (device) memory was being read or written. The sample period may be between 1 second and 1/6 second depending on the product. @@ -1170,7 +1170,7 @@ def memory_utilization(self) -> float | NaType: # in percentage """ # pylint: disable=line-too-long return self.utilization_rates().memory - def encoder_utilization(self) -> float | NaType: # in percentage + def encoder_utilization(self) -> int | NaType: # in percentage """The encoder utilization rate in percentage. 
Returns: Union[int, NaType] @@ -1178,7 +1178,7 @@ def encoder_utilization(self) -> float | NaType: # in percentage """ return self.utilization_rates().encoder - def decoder_utilization(self) -> float | NaType: # in percentage\ + def decoder_utilization(self) -> int | NaType: # in percentage """The decoder utilization rate in percentage. Returns: Union[int, NaType] @@ -2120,8 +2120,8 @@ def processes(self) -> dict[int, GpuProcess]: self.handle, # Only utilization samples that were recorded after this timestamp will be returned. # The CPU timestamp, i.e. absolute Unix epoch timestamp (in microseconds), is used. - # Here we use the timestamp 1/4 second ago to ensure the record buffer is not empty. - time.time_ns() // 1000 - 250_000, + # Here we use the timestamp 1 second ago to ensure the record buffer is not empty. + time.time_ns() // 1000 - 1000_000, default=(), ) for s in sorted(samples, key=lambda s: s.timeStamp): diff --git a/nvitop/api/utils.py b/nvitop/api/utils.py index 8b5b9090..d0ac276d 100644 --- a/nvitop/api/utils.py +++ b/nvitop/api/utils.py @@ -730,10 +730,11 @@ def memoize_when_activated(method: Method) -> Method: """ @functools.wraps(method) - def wrapped(self, *args, **kwargs): # noqa: ANN001,ANN002,ANN003,ANN202 + def wrapped(self: object, *args: Any, **kwargs: Any) -> Any: try: # case 1: we previously entered oneshot() ctx - ret = self._cache[method] # pylint: disable=protected-access + # pylint: disable-next=protected-access + ret = self._cache[method] # type: ignore[attr-defined] except AttributeError: # case 2: we never entered oneshot() ctx return method(self, *args, **kwargs) @@ -742,25 +743,28 @@ def wrapped(self, *args, **kwargs): # noqa: ANN001,ANN002,ANN003,ANN202 # for this entry yet ret = method(self, *args, **kwargs) try: - self._cache[method] = ret # pylint: disable=protected-access + # pylint: disable-next=protected-access + self._cache[method] = ret # type: ignore[attr-defined] except AttributeError: # multi-threading race condition, 
see: # https://github.com/giampaolo/psutil/issues/1948 pass return ret - def cache_activate(self): # noqa: ANN001,ANN202 + def cache_activate(self: object) -> None: """Activate cache. Expects an instance. Cache will be stored as a "_cache" instance attribute. """ if not hasattr(self, '_cache'): - self._cache = {} # pylint: disable=protected-access + # pylint: disable-next=protected-access + self._cache = {} # type: ignore[attr-defined] - def cache_deactivate(self): # noqa: ANN001,ANN202 + def cache_deactivate(self: object) -> None: """Deactivate and clear cache.""" try: - del self._cache # pylint: disable=protected-access + # pylint: disable-next=protected-access + del self._cache # type: ignore[attr-defined] except AttributeError: pass diff --git a/nvitop/cli.py b/nvitop/cli.py index bbb959db..61d856e0 100644 --- a/nvitop/cli.py +++ b/nvitop/cli.py @@ -24,7 +24,7 @@ # pylint: disable=too-many-branches,too-many-statements def parse_arguments() -> argparse.Namespace: - """Parse command-line arguments for ``nvtiop``.""" + """Parse command-line arguments for ``nvitop``.""" coloring_rules = '{} < th1 %% <= {} < th2 %% <= {}'.format( colored('light', 'green'), colored('moderate', 'yellow'), diff --git a/nvitop/version.py b/nvitop/version.py index 49231509..51b6c39f 100644 --- a/nvitop/version.py +++ b/nvitop/version.py @@ -17,7 +17,7 @@ """An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management.""" __version__ = '1.2.0' -__license__ = 'GPLv3' +__license__ = 'GPL-3.0-only AND Apache-2.0' __author__ = __maintainer__ = 'Xuehai Pan' __email__ = 'XuehaiPan@pku.edu.cn' __release__ = False diff --git a/pyproject.toml b/pyproject.toml index f7eec77a..9dd8f964 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,8 @@ target-version = ["py37", "py38", "py39", "py310", "py311"] [tool.isort] atomic = true profile = "black" -src_paths = ["nvitop"] +src_paths = ["nvitop", "nvitop-exporter/nvitop_exporter"] +known_first_party = 
["nvitop", "nvitop_exporter"] indent = 4 line_length = 100 lines_after_imports = 2 @@ -85,14 +86,16 @@ multi_line_output = 3 [tool.mypy] # Sync with requires-python python_version = 3.8 # appease mypy for syntax errors in numpy stubs +mypy_path = [".", "nvitop-exporter"] +exclude = ["nvitop-exporter/setup.py"] pretty = true show_error_codes = true show_error_context = true show_traceback = true allow_redefinition = true check_untyped_defs = true -disallow_incomplete_defs = false -disallow_untyped_defs = false +disallow_incomplete_defs = true +disallow_untyped_defs = true ignore_missing_imports = true no_implicit_optional = true strict_equality = true @@ -119,7 +122,7 @@ ignore-words = "docs/source/spelling_wordlist.txt" target-version = "py37" line-length = 100 show-source = true -src = ["nvitop"] +src = ["nvitop", "nvitop-exporter/nvitop_exporter"] select = [ "E", "W", # pycodestyle "F", # pyflakes