Merge pull request #365 from singnet/angelo/#355/metta-tokenizer

singnet · Nov 14, 2024 · 6a41583 · 6a41583
2 parents 91ef077 + daf00ff
commit 6a41583
Show file tree

Hide file tree

Showing 6 changed files with 201 additions and 0 deletions.
diff --git a/extra/cpp/tokenizers/CMakeLists.txt b/extra/cpp/tokenizers/CMakeLists.txt
@@ -0,0 +1,37 @@
+# Build script for the MeTTa tokenizer static library and its GoogleTest unit tests.
+#
+# FetchContent_MakeAvailable() needs CMake 3.14 and the DOWNLOAD_EXTRACT_TIMESTAMP option used
+# below was introduced in CMake 3.24, so 3.24 is the real minimum for this file.
+cmake_minimum_required(VERSION 3.24)
+project(Tokenizers)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Add GoogleTest (downloaded at configure time; provides the GTest::gtest_main target)
+include(FetchContent)
+FetchContent_Declare(
+   googletest
+   DOWNLOAD_EXTRACT_TIMESTAMP ON
+   URL https://github.com/google/googletest/archive/refs/tags/v1.15.2.zip)
+FetchContent_MakeAvailable(googletest)
+
+# Imported target Threads::Threads replaces linking the raw `pthread` library name.
+find_package(Threads REQUIRED)
+
+enable_testing()
+
+add_library(tokenizers STATIC
+    src/metta_tokenizer.cc)
+
+# PUBLIC: the include paths are needed to build the library and by everything that links it,
+# so consumers such as the test executable inherit them instead of relying on directory scope.
+target_include_directories(tokenizers PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+add_executable(
+    test_metta_tokenizer
+    tests/test_metta_tokenizer.cc)
+
+# GTest::gtest_main carries GoogleTest's include directories and libraries as usage
+# requirements; the GTEST_INCLUDE_DIRS / GTEST_LIBRARIES variables were never defined here
+# because this project does not call find_package(GTest).
+target_link_libraries(
+    test_metta_tokenizer
+    PRIVATE
+    tokenizers
+    GTest::gtest_main
+    Threads::Threads)
+
+include(GoogleTest)
+gtest_discover_tests(test_metta_tokenizer)
diff --git a/extra/cpp/tokenizers/Makefile b/extra/cpp/tokenizers/Makefile
@@ -0,0 +1,10 @@
+# None of these targets produces a file named after itself, so always run their recipes.
+.PHONY: clean build-tests unit-tests
+
+# Remove the CMake build tree.
+clean:
+	@rm -rf ./build
+
+# Configure and build everything from scratch.
+# `$$(nproc)` reaches the shell as `$(nproc)`; a single `$` would make `make` expand an
+# undefined variable named `nproc` and pass no job count to `--parallel`.
+build-tests: clean
+	@mkdir -p ./build \
+		&& cmake -S . -B ./build \
+		&& cmake --build ./build --parallel $$(nproc)
+
+# Build, then run the registered tests through the generated `test` target.
+# `$(MAKE)` is the documented way to invoke make recursively.
+unit-tests: build-tests
+	$(MAKE) -C ./build test
diff --git a/extra/cpp/tokenizers/include/metta_tokenizer.h b/extra/cpp/tokenizers/include/metta_tokenizer.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <string>
+
+using namespace std;
+
+// -------------------------------------------------------------------------------------------------
+/**
+ * @brief Converts a MeTTa expression into a flat, space-separated token string.
+ *
+ * The expression is expected to be in the format `(Similarity (Concept "human") $v1)`.
+ * Each parenthesized group is a link of type `Expression`, emitted as `LINK Expression <n>`
+ * where `<n>` is the number of elements directly inside the parentheses. A group that directly
+ * contains at least one variable is emitted as `LINK_TEMPLATE Expression <n>` instead.
+ * Each element inside the parentheses, such as `Similarity`, `Concept`, and `"human"`, is a
+ * node of type `Symbol`, except for those that start with `$`, which are variables (the `$`
+ * is not part of the emitted name).
+ *
+ * Elements are separated by whitespace and parentheses only; quotes are not interpreted, so a
+ * quoted string that contains whitespace is split into several symbols.
+ *
+ * Example:
+ *
+ * Input: `(Similarity (Concept "human") $v1)`
+ *
+ * Output: `LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol "human" VARIABLE v1`
+ *
+ * @param expression The input MeTTa expression string to be tokenized.
+ * @return The token string representing the parsed expression.
+ * @throws runtime_error if the end of the input is reached before the closing `)` is found.
+ */
+string tokenize(const string& expression);
+
+// -------------------------------------------------------------------------------------------------
+
diff --git a/extra/cpp/tokenizers/main.cc b/extra/cpp/tokenizers/main.cc
@@ -0,0 +1,15 @@
+#include <iostream>
+
+#include "metta_tokenizer.h"
+
+using namespace std;
+
+// -------------------------------------------------------------------------------------------------
+/**
+ * @brief Demo entry point: tokenizes a fixed sample MeTTa expression and prints the result.
+ *
+ * @return 0 after the token string has been written to standard output.
+ */
+int main() {
+    const string expression = "(Similarity (Concept \"human\") $v1)";
+    // Tokenize once and print that result, rather than running the tokenizer a second time.
+    const auto tokenized = tokenize(expression);
+    cout << tokenized << endl;
+    return 0;
+}
+
+// -------------------------------------------------------------------------------------------------
diff --git a/extra/cpp/tokenizers/src/metta_tokenizer.cc b/extra/cpp/tokenizers/src/metta_tokenizer.cc
@@ -0,0 +1,79 @@
+#include <stdexcept>
+#include <tuple>
+#include <vector>
+
+#include "metta_tokenizer.h"
+
+using namespace std;
+
+// -------------------------------------------------------------------------------------------------
+/**
+ * @brief Recursively tokenizes one parenthesized MeTTa expression.
+ *
+ * Scans `expression` from `cursor` up to the first `)` that is not consumed by a nested
+ * expression and renders what it finds as a flat, space-separated token string:
+ *   - the expression itself becomes `LINK Expression <n>`, or `LINK_TEMPLATE Expression <n>`
+ *     when at least one of its direct elements is a variable, `<n>` being the number of direct
+ *     elements;
+ *   - an element starting with `$` becomes `VARIABLE <name>` (the `$` is dropped);
+ *   - any other element becomes `NODE Symbol <text>`;
+ *   - a nested `(...)` is tokenized by a recursive call and counts as one element.
+ *
+ * Elements are delimited by whitespace and parentheses only; quotes have no special meaning, so
+ * a quoted string containing whitespace is split into several elements. Whitespace in front of
+ * the opening parenthesis is skipped. Nothing after the closing parenthesis is read.
+ *
+ * @param expression The input MeTTa expression string to be tokenized.
+ * @param cursor The position at which scanning starts; recursive calls pass the index of the
+ *               nested expression's opening parenthesis. Defaults to 0.
+ * @return A pair holding the index of the `)` that closed this expression and the token string.
+ * @throws runtime_error if the end of the string is reached before a closing `)` is found.
+ */
+pair<size_t, string> _tokenize(const string& expression, size_t cursor = 0) {
+    // The <cctype> classifiers have undefined behavior for negative arguments, which a plain
+    // `char` holding a non-ASCII byte can produce, so every character goes through unsigned char.
+    const auto is_space = [](char c) { return isspace(static_cast<unsigned char>(c)) != 0; };
+
+    // Skip leading whitespace so that `start` lands on this expression's own opening
+    // parenthesis; otherwise that parenthesis would be taken for a nested expression and a
+    // well-formed input such as "  (A B)" would be rejected.
+    while (cursor < expression.size() and is_space(expression[cursor])) {
+        cursor++;
+    }
+
+    string output;
+    string header = "LINK Expression";
+    int target_count = 0;
+    string token;
+    const size_t start = cursor;
+
+    for (; cursor < expression.size(); cursor++) {
+        const char ch = expression[cursor];
+
+        if (ch == '(') {
+            // A parenthesis at `start` opens this expression and is simply stepped over; any
+            // later one opens a nested expression, handled recursively as a single element.
+            if (cursor > start) {
+                tie(cursor, token) = _tokenize(expression, cursor);
+                output += " " + token;
+                target_count++;
+            }
+            // After a recursive call `cursor` sits on the nested `)`; the loop increment
+            // moves past it.
+            continue;
+        }
+
+        if (ch == ')') {
+            return make_pair(cursor, header + " " + to_string(target_count) + output);
+        }
+
+        if (is_space(ch)) {
+            continue;
+        }
+
+        // Collect one element: everything up to the next whitespace or parenthesis.
+        token.clear();
+        while (
+            cursor < expression.size()
+            and not is_space(expression[cursor])
+            and expression[cursor] != '('
+            and expression[cursor] != ')'
+        ) {
+            token += expression[cursor++];
+        }
+        // Step back onto the element's last character so the loop increment lands on the
+        // delimiter that ended it.
+        --cursor;
+
+        // `token` holds at least one character here, because `ch` was not a delimiter.
+        if (token[0] == '$') {
+            header = "LINK_TEMPLATE Expression";
+            output += " VARIABLE " + token.substr(1);
+        } else {
+            output += " NODE Symbol " + token;
+        }
+        target_count++;
+    }
+
+    throw runtime_error("Invalid expression");
+}
+
+// -------------------------------------------------------------------------------------------------
+/**
+ * @brief Public entry point: tokenizes a whole MeTTa expression.
+ *
+ * Runs `_tokenize` with its default starting cursor and keeps only the token string, discarding
+ * the cursor position that comes back with it.
+ *
+ * @param expression The input MeTTa expression string to be tokenized.
+ * @return The token string produced by `_tokenize`.
+ * @throws runtime_error propagated from `_tokenize` when the expression is invalid.
+ */
+string tokenize(const string& expression) {
+    return _tokenize(expression).second;
+}
+
+// -------------------------------------------------------------------------------------------------
+
diff --git a/extra/cpp/tokenizers/tests/test_metta_tokenizer.cc b/extra/cpp/tokenizers/tests/test_metta_tokenizer.cc
@@ -0,0 +1,30 @@
+#include <gtest/gtest.h>
+
+#include "metta_tokenizer.h"
+
+// Checks `tokenize` against a table of MeTTa expressions and the exact token string each one
+// must produce: a variable last, a variable before a nested expression, variables on both
+// sides of a nested expression, and a variable inside the nested expression.
+TEST(MettaTokenizerTest, BasicAssertions) {
+    struct Case {
+        string expression;
+        string expected;
+    };
+
+    const Case cases[] = {
+        {"(Similarity (Concept \"human\") $v1)",
+         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1"},
+        {"(Similarity $v1 (Concept \"human\"))",
+         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity VARIABLE v1 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\""},
+        {"(Similarity $v0 (Concept \"human\") $v1)",
+         "LINK_TEMPLATE Expression 4 NODE Symbol Similarity VARIABLE v0 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1"},
+        {"(Similarity (Concept $v0) $v1)",
+         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK_TEMPLATE Expression 2 NODE Symbol Concept VARIABLE v0 VARIABLE v1"},
+    };
+
+    for (const Case& test_case : cases) {
+        const string actual = tokenize(test_case.expression);
+        EXPECT_EQ(actual, test_case.expected);
+    }
+}
+
+// Test runner entry point: hands the command-line arguments to GoogleTest, runs every
+// registered test and uses the result of RUN_ALL_TESTS() as the process exit status.
+int main(int argc, char **argv) {
+    testing::InitGoogleTest(&argc, argv);
+    const int exit_status = RUN_ALL_TESTS();
+    return exit_status;
+}