# Music Copilot.pyscript

from typing import Any
import flpianoroll as flp
from dataclasses import dataclass
import re
import subprocess
import json
import time
from abc import ABC, abstractmethod


def print(*values):
    """Write ``values`` to the FL Studio script log.

    The values are stringified and joined with single spaces; the resulting
    text is then logged one line at a time via ``flp.Utils.log``.
    """
    message = " ".join(str(value) for value in values)
    for log_line in message.splitlines():
        flp.Utils.log(log_line)


def ticks_to_beats_str(ticks):
    """Format a tick count as a reduced fraction of beats.

    A beat is always treated as ``flp.score.PPQ`` ticks (a quarter note).
    TODO: tsden is not reliable, which is why it is deliberately not used here.

    Returns:
        ``"<n>"`` when the reduced denominator is 1, otherwise ``"<n>/<d>"``.
    """
    top = ticks
    bottom = flp.score.PPQ

    # Euclid's algorithm for the greatest common divisor of top and bottom.
    divisor, remainder = top, bottom
    while remainder:
        divisor, remainder = remainder, divisor % remainder

    top //= divisor
    bottom //= divisor

    return str(top) if bottom == 1 else f"{top}/{bottom}"


def beats_str_to_ticks(beats_str):
    """Parse a beat string (``"3"`` or ``"3/4"``) into a tick count.

    A beat is treated as ``flp.score.PPQ`` ticks; fractional results are
    floored by integer division.
    """
    if "/" not in beats_str:
        whole_beats = int(beats_str)
        return whole_beats * flp.score.PPQ

    numerator, denominator = (int(part) for part in beats_str.split("/"))
    return numerator * flp.score.PPQ // denominator


def ticks_to_beats(ticks):
    """Convert a tick count to beats, one beat being ``flp.score.PPQ`` ticks."""
    ticks_per_beat = flp.score.PPQ
    return ticks / ticks_per_beat


def beats_to_ticks(beats):
    """Convert a (possibly fractional) beat count to the nearest whole tick.

    A beat is treated as ``flp.score.PPQ`` ticks (tsden is intentionally not
    used). The result is rounded rather than truncated: beat values that were
    serialised with limited decimal precision, or that carry binary
    floating-point error, would otherwise land one tick short. For example,
    with a PPQ of 96 a third of a beat (32 ticks) is written as ``0.333``;
    ``0.333 * 96`` is ``31.968``, which truncates to 31 but rounds to 32.

    Args:
        beats: Number of beats as an int or float.

    Returns:
        The tick count as an ``int``.
    """
    return int(round(beats * flp.score.PPQ))


# Pitch-class names in semitone order, using sharps for the black keys.
NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
# Flat spellings mapped to the equivalent sharp spelling in NOTE_NAMES.
NOTE_ALIASES = {"Db": "C#", "Eb": "D#", "Gb": "F#", "Ab": "G#", "Bb": "A#"}

# Pitch-class name (sharp or flat spelling) -> semitone index within an octave.
NOTE_INDEX_MAPPING = {note_name: index for index, note_name in enumerate(NOTE_NAMES)}
NOTE_INDEX_MAPPING.update(
    {alias: NOTE_INDEX_MAPPING[note_name] for alias, note_name in NOTE_ALIASES.items()}
)

# Semitone shift applied by an accidental ("" means a natural note).
_ACCIDENTAL_OFFSETS = {"": 0, "#": 1, "b": -1}


def note_number_to_str(note_number):
    """Return the note name for a note number, e.g. ``60 -> "C5"``.

    The octave is ``note_number // 12`` and the pitch class is spelled with
    sharps (see ``NOTE_NAMES``).
    """
    octave = note_number // 12
    note_index = note_number % 12
    note_name = NOTE_NAMES[note_index]
    return f"{note_name}{octave}"


def str_to_note_number(note_str):
    """Parse a note name such as ``"C5"``, ``"c#4"`` or ``"Bb3"``.

    This is the inverse of ``note_number_to_str``: the result is
    ``octave * 12 + pitch_class``. Matching is case-insensitive. The pitch
    class is computed from the natural letter plus the accidental, so every
    ``[A-G][#b]`` spelling is accepted, including the enharmonic spellings
    ``E#``, ``B#``, ``Cb`` and ``Fb`` that have no entry in
    ``NOTE_INDEX_MAPPING`` (``"Cb4"`` is ``"B3"``, ``"B#4"`` is ``"C5"``).

    Raises:
        IndexError: If ``note_str`` is empty.
        ValueError: If the octave part is not an integer.
        KeyError: If the first character is not a note letter A-G.
    """
    # capitalize() upper-cases the letter and lower-cases the rest, which
    # turns an upper-case "B" accidental into the expected "b".
    note_str = note_str.capitalize()
    letter = note_str[0]

    # An accidental is only recognised when an octave follows it.
    if len(note_str) > 2 and note_str[1] in {"b", "#"}:
        accidental = note_str[1]
        octave = int(note_str[2:])
    else:
        accidental = ""
        octave = int(note_str[1:])

    note_index = NOTE_INDEX_MAPPING[letter] + _ACCIDENTAL_OFFSETS[accidental]
    return octave * 12 + note_index


def format_beats(beats: float):
    """Format a beat count with at most three decimals and no trailing zeros.

    Examples: ``1.0 -> "1"``, ``0.5 -> "0.5"``, ``1 / 3 -> "0.333"``.

    This is a plain module-level function. It was previously wrapped in
    ``@staticmethod``, which has no meaning outside a class body and makes the
    name non-callable on Python versions before 3.10.
    """
    return f"{beats:.3f}".rstrip("0").rstrip(".")


@dataclass
class Note:
    """Plain-Python copy of the piano-roll note fields this script uses."""

    time: int  # Start position, in ticks
    length: int  # Duration, in ticks
    number: int  # Note number (see note_number_to_str for the naming)
    selected: bool = False

    @staticmethod
    def from_flp_note(note: flp.Note):
        """Copy time, length, number and selected from an ``flp.Note``."""
        return Note(note.time, note.length, note.number, note.selected)

    def to_flp_note(self):
        """Create a fresh ``flp.Note`` carrying this note's four fields."""
        flp_note = flp.Note()
        for attribute in ("time", "length", "number", "selected"):
            setattr(flp_note, attribute, getattr(self, attribute))
        return flp_note

    def __str__(self):
        """Readable form with time/length as beat fractions and a named pitch."""
        described_fields = (
            f"time={ticks_to_beats_str(self.time)}",
            f"length={ticks_to_beats_str(self.length)}",
            f"number={note_number_to_str(self.number)}",
            f"selected={self.selected}",
        )
        return "Note(" + ", ".join(described_fields) + ")"

    def __repr__(self):
        """Same text as ``str(self)``."""
        return str(self)


class ScorePromptEncoder(ABC):
    """Interface for turning notes into prompt tokens and back.

    Subclasses define the token format; this base class provides
    ``stringify_encoded_strings``, which trims a token list to a character
    budget while keeping the selected region intact.
    """

    @abstractmethod
    def encode(
        self,
        notes: list[Note],
        timeline_selection_start_ticks: int | None,
        timeline_selection_end_ticks: int | None,
    ) -> list[str]:
        """Return the tokens representing ``notes`` and the timeline selection."""

    @abstractmethod
    def decode(
        self,
        encoded_strings: list[str],
        start_time_ticks: int,
        end_time_ticks: int | None,
    ):
        """Turn tokens produced by the model back into notes."""

    @abstractmethod
    def is_timeline_selection_start(self, encoded_token: str) -> bool:
        """Return whether ``encoded_token`` opens the timeline selection."""

    @abstractmethod
    def is_timeline_selection_end(self, encoded_token: str) -> bool:
        """Return whether ``encoded_token`` closes the timeline selection."""

    @abstractmethod
    def is_selected_note(self, encoded_token: str) -> bool:
        """Return whether ``encoded_token`` represents a selected note."""

    @abstractmethod
    def get_system_prompt(
        self, has_timeline_selection: bool, has_selected_notes: bool
    ) -> str:
        """Return the text explaining the token format to the model."""

    def stringify_encoded_strings(
        self, encoded_strings: list[str], max_characters: int
    ) -> str:
        """Join as many tokens as fit into ``max_characters``, space separated.

        The timeline selection (or, without one, the span from the first to the
        last selected note) is always kept; the window then grows outwards,
        alternating left and right, while the budget allows. With no selection
        at all the window grows backwards from the end of the list.

        Raises:
            ValueError: If the selection markers are duplicated, unpaired or
                out of order, or if the mandatory region alone exceeds
                ``max_characters``.
        """
        token_count = len(encoded_strings)
        bracket_open = bracket_close = None
        selected_first = selected_last = None

        # Locate the selection markers and the span of selected notes.
        for position, token in enumerate(encoded_strings):
            if self.is_timeline_selection_start(token):
                if bracket_open is not None:
                    raise ValueError("Multiple timeline selection start found")
                bracket_open = position
                continue

            if self.is_timeline_selection_end(token):
                if bracket_close is not None:
                    raise ValueError("Multiple timeline selection end found")
                if bracket_open is None:
                    raise ValueError("Timeline selection end found before start")
                bracket_close = position
                continue

            if self.is_selected_note(token):
                if selected_first is None:
                    selected_first = position
                selected_last = position

        if (bracket_open is None) != (bracket_close is None):
            raise ValueError("Only one of timeline selection start/end found")

        # Inclusive window [lo, hi] that must be part of the output. Without
        # any selection it starts empty, positioned just past the last token.
        if bracket_open is not None:
            lo, hi = bracket_open, bracket_close
        elif selected_first is not None:
            lo, hi = selected_first, selected_last
        else:
            lo, hi = token_count, token_count - 1

        # Characters used by the window: token lengths plus separating spaces.
        used = sum(len(token) for token in encoded_strings[lo : hi + 1])
        used += max(0, hi - lo)

        if used > max_characters:
            if bracket_open is not None:
                raise ValueError("Timeline selection contains too many notes")

            if selected_first is not None:
                raise ValueError("Too many notes selected")

            raise ValueError("This should not happen")

        # Grow the window one token at a time, left side first, until neither
        # side can take another token (each addition costs one extra space).
        last_position = token_count - 1
        while lo > 0 or hi < last_position:
            grew = False

            if lo > 0:
                cost = len(encoded_strings[lo - 1]) + 1
                if used + cost <= max_characters:
                    used += cost
                    lo -= 1
                    grew = True

            if hi < last_position:
                cost = len(encoded_strings[hi + 1]) + 1
                if used + cost <= max_characters:
                    used += cost
                    hi += 1
                    grew = True

            if not grew:
                break

        assert used <= max_characters

        return " ".join(encoded_strings[lo : hi + 1])


class ScorePromptEncoderV2(ScorePromptEncoder):
    """Encoder using absolute-time tokens of the form ``<time>-<note>-<length>``.

    Times and lengths are written in beats with at most three decimals. A
    trailing ``*`` marks a selected note, and ``<time>-[`` / ``<time>-]`` mark
    the start and end of the timeline selection.
    """

    # Token pattern: <time>-<note name><octave>-<length>, optionally followed
    # by "*". The octave accepts several digits because note_number_to_str
    # emits two-digit octaves for note numbers of 120 and above (e.g. "C10");
    # a single-digit pattern silently dropped such notes when decoding.
    _NOTE_PATTERN = re.compile(r"([\d.]+)-([A-G][#b]?[0-9]+)-([\d.]+)\*?")

    def encode(
        self,
        notes: list[Note],
        timeline_selection_start_ticks: int | None,
        timeline_selection_end_ticks: int | None,
    ):
        """Encode ``notes`` (and the timeline selection, if any) as tokens.

        Args:
            notes: Notes to encode; the list itself is not modified.
            timeline_selection_start_ticks: Selection start in ticks, or None.
            timeline_selection_end_ticks: Selection end in ticks, or None.
                Both bounds must be given together or both be None.

        Returns:
            Tokens ordered by time and then by note number.
        """
        assert (timeline_selection_start_ticks is None) == (
            timeline_selection_end_ticks is None
        )

        # The selection bounds are inserted as zero-length dummy notes with
        # number -1 so that, at equal times, they sort before any real note.
        start_marker = (
            Note(timeline_selection_start_ticks, 0, -1)
            if timeline_selection_start_ticks is not None
            else None
        )
        end_marker = (
            Note(timeline_selection_end_ticks, 0, -1)
            if timeline_selection_end_ticks is not None
            else None
        )

        entries = notes[:]
        entries.extend(
            marker for marker in (start_marker, end_marker) if marker is not None
        )
        # The sort is stable, so a start marker stays ahead of an end marker
        # placed at the same time.
        entries.sort(key=lambda note: (note.time, note.number))

        encoded_list = []
        for note in entries:
            time_str = format_beats(ticks_to_beats(note.time))

            # Markers are recognised by identity, never by field equality.
            if note is start_marker:
                encoded_list.append(f"{time_str}-[")
                continue

            if note is end_marker:
                encoded_list.append(f"{time_str}-]")
                continue

            note_str = note_number_to_str(note.number)
            length_str = format_beats(ticks_to_beats(note.length))
            selected_suffix = "*" if note.selected else ""
            encoded_list.append(f"{time_str}-{note_str}-{length_str}{selected_suffix}")

        return encoded_list

    def decode(
        self,
        encoded_strings: list[str],
        start_time_ticks: int,
        end_time_ticks: int | None,
    ):
        """Parse note tokens back into ``Note`` objects.

        Tokens that do not start with a valid note pattern (for example the
        selection markers) are ignored. Decoded notes are never marked as
        selected. ``start_time_ticks`` and ``end_time_ticks`` are accepted for
        interface compatibility but are not currently used: notes are neither
        shifted nor trimmed.

        Returns:
            The decoded notes sorted by start time.
        """
        notes: list[Note] = []

        for part in encoded_strings:
            note_match = self._NOTE_PATTERN.match(part)
            if not note_match:
                continue

            time_text, note_str, length_text = note_match.groups()
            notes.append(
                Note(
                    beats_to_ticks(float(time_text)),
                    beats_to_ticks(float(length_text)),
                    str_to_note_number(note_str),
                )
            )

        return sorted(notes, key=lambda note: note.time)

    def is_timeline_selection_start(self, encoded_token: str) -> bool:
        """Return True for a ``<time>-[`` token."""
        return encoded_token.endswith("-[")

    def is_timeline_selection_end(self, encoded_token: str) -> bool:
        """Return True for a ``<time>-]`` token."""
        return encoded_token.endswith("-]")

    def is_selected_note(self, encoded_token: str) -> bool:
        """Return True for a token carrying the trailing ``*`` selection mark."""
        return encoded_token.endswith("*")

    def get_system_prompt(
        self, has_timeline_selection: bool, has_selected_notes: bool
    ) -> str:
        """Describe the token format to the model.

        The timeline-selection and selected-note explanations are only
        included when the corresponding flag is set.
        """
        instructions = [
            """- The score is a space separated string of notes.""",
            """- Each note is represented by a string in the format "<time>-<note>-<length>", where <time> is the start time of the note in beats (floating point, up to 0.001 precision), <note> is a string representing the note name and octave (e.g. "C4", "A#5"), and <length> is a string representing the length of the note in beats (floating point, up to 0.001 precision).""",
            """- All note times are absolute, including your output notes.""",
            """- Example: 2-C4-0.5""",
        ]
        if has_timeline_selection:
            instructions.append(
                """- Timeline selection region is represented by "<time>-[" and "<time>-]", where <time> is the start/end time of the timeline selection in beats (floating point, up to 0.001 precision)."""
            )
        if has_selected_notes:
            instructions.append(
                """- A selected note has a * at the end of the string (e.g. "1-C4-0.5*")."""
            )
        return "\n".join(instructions)


def write_note(note):
    """Add ``note`` to the FL Studio score after converting it to ``flp.Note``."""
    flp_note = note.to_flp_note()
    flp.score.addNote(flp_note)


# Model identifiers offered in the dialog's "GPT Model" combo box. The combo
# value is used as an index into this list, so the order matters.
GPT_MODELS = [
    "gpt-4-0125-preview",
    "gpt-4-1106-preview",
    "gpt-4-turbo-preview",
    "gpt-4-0613",
    "gpt-4",
    "gpt-3.5-turbo-0125",
]


# The API key is read once, when the script is loaded, from a path relative to
# the current working directory; a missing file raises at load time.
with open("openai_api_key.txt", "r") as file:
    OPENAI_API_KEY = file.read().strip()


def query_llm(api_key, prompt, model):
    """Run the helper script with ``prompt`` and return the model's reply.

    The request is passed to ``music_copilot_helper.py`` as a single JSON
    command-line argument. The helper's combined stdout/stderr is expected to
    start with a status line: ``SUCCESS`` means the remaining lines are the
    reply; anything else is treated as a failure whose details follow.

    NOTE(review): the API key travels on the command line, where other local
    processes may be able to read it; consider passing it via stdin or the
    environment if the helper can be changed.

    Args:
        api_key: API key forwarded to the helper.
        prompt: Full prompt text.
        model: Model identifier forwarded to the helper.

    Returns:
        The reply text (everything after the ``SUCCESS`` line).

    Raises:
        Exception: If the helper prints nothing or does not report SUCCESS.
    """
    json_string = json.dumps(
        {
            "prompt": prompt,
            "api_key": api_key,
            "model": model,
        }
    )
    command = ["python", "music_copilot_helper.py", json_string]
    process: Any = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
    )

    # communicate() keeps draining the pipe while we wait. Merely polling the
    # process would deadlock once the child fills the pipe buffer, because
    # nothing would be reading it. Retrying after a timeout loses no output,
    # and communicate() closes the pipe when the process ends.
    step = 0
    while True:
        try:
            output_data, _ = process.communicate(timeout=0.25)
            break
        except subprocess.TimeoutExpired:
            flp.Utils.ProgressMsg("Generating...", step, 10)
            step = (step + 1) % 10

    response = output_data.decode("utf-8-sig").strip().splitlines()
    if not response or response[0] != "SUCCESS":
        # Prefer the lines after the status line; fall back to the whole
        # output so that a one-line failure still produces a message.
        details = "\n".join(response[1:]) or "\n".join(response)
        raise Exception(details or "The helper script produced no output.")

    return "\n".join(response[1:])


# Character budget for the complete prompt, including the encoded score.
MAX_PROMPT_LENGTH = 7500
# Upper bound for the prompt before the score is inserted; this reserves at
# least 500 characters of the budget for notes.
MAX_PROMPT_LENGTH_WITHOUT_NOTES = MAX_PROMPT_LENGTH - 500


@dataclass
class ScoreCopilotResult:
    """Outcome of a copilot run, consumed by ``apply``."""

    # Notes to write into the score.
    new_notes: list[Note]
    # When True, ``apply`` calls ``flp.score.clearNotes()`` before writing.
    remove_selected_notes: bool


class ScoreCopilot(ABC):
    """Interface for a copilot that produces notes from a user prompt."""

    @abstractmethod
    def get_description(self) -> str:
        """Return the description text shown in the script dialog."""
        pass

    @abstractmethod
    def run(
        self,
        model_to_use: str,
        user_prompt_text: str,
        sorted_notes: list[Note],
        start_time_ticks: int,
        end_time_ticks: int | None,
        has_timeline_selection: bool,
        has_selected_notes: bool,
    ) -> ScoreCopilotResult:
        """Generate notes for the given prompt and score context.

        Args:
            model_to_use: Model identifier (one of ``GPT_MODELS`` when called
                from ``apply``).
            user_prompt_text: Instruction typed by the user.
            sorted_notes: Existing notes, sorted by time then note number.
            start_time_ticks: Start of the region of interest, in ticks.
            end_time_ticks: End of that region in ticks, or None if open-ended.
            has_timeline_selection: Whether a timeline selection is active.
            has_selected_notes: Whether any note is selected.
        """
        pass


class ScoreCopilotChat(ScoreCopilot):
    """Copilot that asks a chat model for notes to add or replace.

    The prompt contains the encoder's format description, the (possibly
    trimmed) existing score and the user's instruction. The reply must contain
    a marker line followed by one line of notes in the encoder's format.
    """

    NO_SELECTED_NOTES_ADDITIONAL_PROMPT = """
You can only add notes. Output the notes you would like to add to the score.
Your output must begin with "BEGIN_NOTES" followed by a single line containing the added notes in copilot format, such as:
```
BEGIN_NOTES
(notes to add in copilot format)
```
""".strip()
    SELECTED_NOTES_ADDITIONAL_PROMPT = """
Depending if you are modifying existing notes, your output must begin with either "ADD_NOTES" (keep selected notes) or "REPLACE_SELECTED_NOTES" (remove selected notes automatically) followed by a single line containing the added notes in copilot format, such as:
```
ADD_NOTES
(notes to add in copilot format)
```
""".strip()
    PROMPT_TEMPLATE = """
You are music copilot, a talented music assistant that helps the user with music production in the piano roll.
You work with a score format called "copilot" that represents notes in a musical score:
{encoder_system_prompt}

Here are the existing score in copilot format:
{{notes}}

The user's instruction is: {user_prompt}

{additional_prompt}
Do not output anything else. Only notes as described above.
""".strip()

    # Literal text in PROMPT_TEMPLATE marking where the encoded score goes.
    _NOTES_PLACEHOLDER = "{{notes}}"
    # Accepted marker lines mapped to "remove the selected notes?".
    _RESPONSE_MARKERS = {
        "BEGIN_NOTES": False,
        "ADD_NOTES": False,
        "REPLACE_SELECTED_NOTES": True,
    }

    def __init__(self, score_prompt_encoder: ScorePromptEncoder):
        """Store the encoder used to build prompts and decode replies."""
        self.score_prompt_encoder = score_prompt_encoder

    def get_description(self):
        """Return the description shown in the script dialog."""
        return """WARNING: PREVIEW MUST BE DISABLED. This script uses language model to generate new notes or edit selected notes based on the provided prompt. Note that if any notes are selected, the model can only see the selected notes (due to FL Studio restriction)."""

    @classmethod
    def build_prompt(
        cls,
        user_prompt_text: str,
        sorted_notes: list[Note],
        timeline_selection_start: int | None,
        timeline_selection_end: int | None,
        has_selected_notes: bool,
        encoder: ScorePromptEncoder | None = None,
    ):
        """Assemble the full prompt, trimming the score to the length budget.

        The template is split at the notes placeholder and each half is
        formatted exactly once, after which the encoded score is concatenated
        in between. Text is therefore never re-parsed as a format string, so
        braces in the user's prompt (or in any inserted text) are harmless.

        Args:
            user_prompt_text: Instruction typed by the user.
            sorted_notes: Existing notes, sorted by time then note number.
            timeline_selection_start: Selection start in ticks, or None.
            timeline_selection_end: Selection end in ticks, or None. Both
                bounds must be given together or both be None.
            has_selected_notes: Whether any note is selected.
            encoder: Encoder to use; defaults to the module-level
                ``score_prompt_encoder`` for backward compatibility.

        Returns:
            The prompt text, at most ``MAX_PROMPT_LENGTH`` characters long.

        Raises:
            Exception: If the prompt without notes already exceeds
                ``MAX_PROMPT_LENGTH_WITHOUT_NOTES``.
            ValueError: Propagated from the encoder when the selected region
                does not fit into the remaining budget.
        """
        assert (timeline_selection_start is None) == (timeline_selection_end is None)
        has_timeline_selection = timeline_selection_start is not None

        if encoder is None:
            encoder = score_prompt_encoder

        additional_prompt = (
            cls.SELECTED_NOTES_ADDITIONAL_PROMPT
            if has_selected_notes
            else cls.NO_SELECTED_NOTES_ADDITIONAL_PROMPT
        )
        fields = {
            "user_prompt": user_prompt_text,
            "additional_prompt": additional_prompt,
            "encoder_system_prompt": encoder.get_system_prompt(
                has_timeline_selection,
                has_selected_notes,
            ),
        }

        head, placeholder, tail = cls.PROMPT_TEMPLATE.partition(cls._NOTES_PLACEHOLDER)
        head = head.format(**fields)
        tail = tail.format(**fields)

        template_length = len(head) + len(tail)
        if template_length > MAX_PROMPT_LENGTH_WITHOUT_NOTES:
            raise Exception(
                "Prompt length is too long. Please reduce user prompt length."
            )

        notes_str = encoder.stringify_encoded_strings(
            encoder.encode(
                sorted_notes, timeline_selection_start, timeline_selection_end
            ),
            MAX_PROMPT_LENGTH - template_length,
        )
        # A template without the placeholder simply carries no score.
        return head + (notes_str if placeholder else "") + tail

    def run(
        self,
        model_to_use: str,
        user_prompt_text: str,
        sorted_notes: list[Note],
        start_time_ticks: int,
        end_time_ticks: int | None,
        has_timeline_selection: bool,
        has_selected_notes: bool,
    ):
        """Query the model and decode the notes from its reply.

        The first line that equals one of the known markers decides whether
        the selected notes are to be removed; the line right after it holds
        the notes. A marker on the very last line has no notes line and is
        therefore not accepted.

        Raises:
            Exception: If no marker followed by a notes line is found, or if
                querying the model fails.
        """
        model_prompt = self.build_prompt(
            user_prompt_text,
            sorted_notes,
            start_time_ticks if has_timeline_selection else None,
            end_time_ticks if has_timeline_selection else None,
            has_selected_notes,
            encoder=self.score_prompt_encoder,
        )
        print("===== Prompt =====")
        print(model_prompt)
        model_response = query_llm(OPENAI_API_KEY, model_prompt, model_to_use)
        print("===== Response =====")
        print(model_response)

        lines = [line.strip() for line in model_response.splitlines()]

        remove_selected_notes = False
        notes_str = None
        for marker_line, following_line in zip(lines, lines[1:]):
            if marker_line in self._RESPONSE_MARKERS:
                remove_selected_notes = self._RESPONSE_MARKERS[marker_line]
                notes_str = following_line
                break

        if notes_str is None:
            raise Exception("No notes found in the model response. Please try again.")

        new_notes = self.score_prompt_encoder.decode(
            notes_str.split(), start_time_ticks, end_time_ticks
        )

        return ScoreCopilotResult(new_notes, remove_selected_notes)


# Module-level wiring: the encoder and copilot instances used by the dialog
# callbacks below.
score_prompt_encoder = ScorePromptEncoderV2()

score_copilot = ScoreCopilotChat(score_prompt_encoder)

# Snapshot of the score taken when the script is loaded, ordered by start time
# and then note number. NOTE(review): it is not refreshed inside apply(), so
# it reflects the score as it was at load time; confirm that is intended.
sorted_notes = sorted(
    [Note.from_flp_note(flp.score.getNote(i)) for i in range(flp.score.noteCount)],
    key=lambda note: (note.time, note.number),
)


def createDialog():
    """Build the script dialog with a prompt field and a model selector."""
    form = flp.ScriptDialog("Copilot", score_copilot.get_description())
    form.AddInputText("Prompt", "")
    model_choices = ",".join(GPT_MODELS)
    form.AddInputCombo("GPT Model", model_choices, 0)
    return form


def apply(form):
    """Run the copilot with the dialog's inputs and write the resulting notes.

    An empty prompt only shows a message. Otherwise the region of interest is
    taken from the timeline selection if there is one, else from the span of
    the selected notes, else it starts where the last note ends (or at 0 for
    an empty score) and is open-ended.
    """
    user_prompt_text = form.GetInputValue("Prompt")
    if not user_prompt_text.strip():
        flp.Utils.ShowMessage(
            "Please provide a prompt, and make sure preview is disabled."
        )
        return

    model_to_use = GPT_MODELS[form.GetInputValue("GPT Model")]

    timeline_selection = flp.score.getTimelineSelection()
    # A negative bound means there is no timeline selection.
    has_timeline_selection = not (
        timeline_selection[0] < 0 or timeline_selection[1] < 0
    )

    selected = [note for note in sorted_notes if note.selected]
    has_selected_notes = bool(selected)

    if has_timeline_selection:
        start_time_ticks, end_time_ticks = timeline_selection
    elif not sorted_notes:
        start_time_ticks, end_time_ticks = 0, None
    elif has_selected_notes:
        # Span from the first selected note to the end of the last one.
        start_time_ticks = selected[0].time
        end_time_ticks = selected[-1].time + selected[-1].length
    else:
        # Nothing selected: continue right after the last note.
        final_note = sorted_notes[-1]
        start_time_ticks = final_note.time + final_note.length
        end_time_ticks = None

    result = score_copilot.run(
        model_to_use,
        user_prompt_text,
        sorted_notes,
        start_time_ticks,
        end_time_ticks,
        has_timeline_selection,
        has_selected_notes,
    )

    if result.remove_selected_notes:
        flp.score.clearNotes()

    for new_note in result.new_notes:
        write_note(new_note)