Skip to content

Commit

Permalink
Update ai_setup_nvidia_cuda for ilab > v0.17 (#8830)
Browse files Browse the repository at this point in the history
  • Loading branch information
tonykay authored Nov 15, 2024
1 parent 2870379 commit 3fce44b
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 71 deletions.
42 changes: 9 additions & 33 deletions ansible/roles/ai_setup_nvidia_cuda/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,56 +3,32 @@

# Common vars

ai_setup_nvidia_cuda_python_version: '3.11'
ai_setup_nvidia_cuda_cuda_version: '12-4'
ai_setup_nvidia_cuda_debug: false
setup_nvidia_cuda_python_version: '3.11'
setup_nvidia_cuda_cuda_version: '12.4'
setup_nvidia_cuda_debug: false

ai_setup_nvidia_cuda_rhel_repos:
setup_nvidia_cuda_common_dnf_packages:

- name: CUDA
description: CUDA Repository
baseurl: https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
enabled: true
gpgcheck: false

ai_setup_nvidia_cuda_common_dnf_packages:

- "python{{ ai_setup_nvidia_cuda_python_version }}"
- "python{{ ai_setup_nvidia_cuda_python_version }}-devel"
- "python{{ ai_setup_nvidia_cuda_python_version }}-pip"
- "python{{ setup_nvidia_cuda_python_version }}"
- "python{{ setup_nvidia_cuda_python_version }}-devel"
- "python{{ setup_nvidia_cuda_python_version }}-pip"
- pciutils
- nvtop
- screen
- tmux
- hyperfine

# RHEL Vars

ai_setup_nvidia_cuda_nvidia_rhel_dnf_packages:
setup_nvidia_cuda_nvidia_rhel_dnf_packages:

- "@nvidia-driver:latest-dkms"
- cuda-toolkit
- nvidia-gds

ai_setup_nvidia_cuda_rhel_repos:

# - name: epel-release-latest-9.noarch
# description: EPEL Repository
# baseurl: https://dl.fedoraproject.org/pub/epel
# enabled: true
# gpgcheck: false
setup_nvidia_cuda_rhel_repos:

- name: cuda-rhel-x86_64
description: NVIDIA CUDA Repository
baseurl: https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64
enabled: true
gpgcheck: false

# Nvidia Vars

ai_setup_nvidia_cuda_fedora_version: 39

ai_setup_nvidia_cuda_nvidia_fedora_dnf_packages:

- "@nvidia-driver:open-dkms"
- cuda-toolkit-12-4
61 changes: 23 additions & 38 deletions ansible/roles/ai_setup_nvidia_cuda/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,64 +17,49 @@
baseurl: "{{ repo.baseurl }}"
enabled: "{{ repo.enabled | default(true) }}"
gpgcheck: "{{ repo.gpgcheck | default(false) }}"
loop: "{{ ai_setup_nvidia_cuda_rhel_repos }}"
loop: "{{ setup_nvidia_cuda_rhel_repos }}"
loop_control:
loop_var: repo

- name: Install nvdia drivers and CUDA
ansible.builtin.dnf:
name: "{{ package }}"
state: present
loop: "{{ ai_setup_nvidia_cuda_nvidia_rhel_dnf_packages }}"
loop: "{{ setup_nvidia_cuda_nvidia_rhel_dnf_packages }}"
loop_control:
loop_var: package

- name: Setup Nvidia Drivers and CUDA for Fedora
when: ansible_distribution == 'Fedora'
block:
- name: Setup alternatives properly
block:

- name: Add a DNF repository
ansible.builtin.yum_repository:
name: cuda-fedora39
description: NVIDIA CUDA Repository
baseurl: https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64
enabled: true
gpgcheck: false
- name: Set CUDA alternatives/cuda
community.general.alternatives:
name: cuda
path: "/usr/local/cuda-{{ setup_nvidia_cuda_cuda_version }}"
link: /etc/alternatives/cuda

# - name: Remove the cuda-12 alternative that gets misconfigured
# ansible.builtin.file:
# path: /etc/alternatives/cuda-12
# state: absent
#
# - name: Set CUDA alternatives/cuda-12
# community.general.alternatives:
# name: cuda-12
# path: "/usr/local/cuda-{{ setup_nvidia_cuda_cuda_version }}"
# link: /etc/alternatives/cuda-12

- name: Setup nvdia repo, drivers, and cuda
ansible.builtin.dnf:
name: "{{ package }}"
state: present
loop: "{{ ai_setup_nvidia_cuda_nvidia_fedora_dnf_packages }}"
loop_control:
loop_var: package

- name: Debug - Setup Nvidia Drivers and CUDA
when: ai_setup_nvidia_cuda_debug | default(false) | bool
block:

- name: Check video driver
ansible.builtin.shell: "lspci -nn -k | grep -A 2 -e VGA -e 3D"
register: r_video_driver_check
changed_when: false
ignore_errors: true

- name: Output video driver check
ansible.builtin.debug:
var: r_video_driver_check.stdout_lines

# Tasks common to RHEL and Fedora
tags:
- nvidia-alternatives

- name: Install common AI centric toolchain packages
ansible.builtin.dnf:
name: "{{ package }}"
state: present
loop: "{{ ai_setup_nvidia_cuda_common_dnf_packages }}"
loop: "{{ setup_nvidia_cuda_common_dnf_packages }}"
loop_control:
loop_var: package

# TODO: Need to add a check here to see if the video driver is in use; if not, reboot the machine

# TODO: Is reboot really necessary here? Investigate

# - name: Reboot the machine
Expand Down

0 comments on commit 3fce44b

Please sign in to comment.