Skip to content

Commit

Permalink
Merge pull request #365 from singnet/angelo/#355/metta-tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
angeloprobst authored Nov 14, 2024
2 parents 91ef077 + daf00ff commit 6a41583
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 0 deletions.
37 changes: 37 additions & 0 deletions extra/cpp/tokenizers/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Build script for the MeTTa tokenizer static library and its GoogleTest suite.
#
# FetchContent_MakeAvailable() first shipped in CMake 3.14, so 3.10 could never
# configure this project.
cmake_minimum_required(VERSION 3.14...3.30)
project(Tokenizers LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Add GoogleTest
include(FetchContent)

# DOWNLOAD_EXTRACT_TIMESTAMP is only understood by CMake >= 3.24; older
# versions do not know the keyword, so only pass it when it is supported.
set(tokenizers_gtest_extract_args "")
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
  set(tokenizers_gtest_extract_args DOWNLOAD_EXTRACT_TIMESTAMP ON)
endif()

FetchContent_Declare(
  googletest
  URL https://github.com/google/googletest/archive/refs/tags/v1.15.2.zip
  ${tokenizers_gtest_extract_args})
FetchContent_MakeAvailable(googletest)

# Provides the Threads::Threads imported target (replaces a raw `pthread`).
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

enable_testing()

add_library(tokenizers STATIC
  src/metta_tokenizer.cc)

# PUBLIC: the headers under include/ are needed both to build the library and
# by every target that links against it.
target_include_directories(tokenizers
  PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}/include)

add_executable(
  test_metta_tokenizer
  tests/test_metta_tokenizer.cc)

# GoogleTest's include directories come with the GTest:: target; the
# GTEST_INCLUDE_DIRS / GTEST_LIBRARIES variables are only set by
# find_package(GTest) and are empty when GoogleTest comes from FetchContent.
target_link_libraries(
  test_metta_tokenizer
  PRIVATE
    tokenizers
    GTest::gtest_main
    Threads::Threads)

include(GoogleTest)
gtest_discover_tests(test_metta_tokenizer)
10 changes: 10 additions & 0 deletions extra/cpp/tokenizers/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Number of parallel build jobs. `$(nproc)` would be expanded by make as an
# (empty) make variable, leaving `--parallel` without a job limit, so the
# command is run through $(shell ...) instead. Falls back to 1 when `nproc`
# is not available. Override with `make JOBS=4 build-tests`.
JOBS ?= $(shell nproc 2>/dev/null || echo 1)

# None of these targets produce a file with the same name.
.PHONY: clean build-tests unit-tests

# Remove the whole CMake build tree.
clean:
	@rm -rf ./build

# Configure and build the library and tests from scratch.
build-tests: clean
	@mkdir -p ./build \
		&& cmake -S . -B ./build \
		&& cmake --build ./build --parallel $(JOBS)

# Build, then run the registered tests through the generated `test` target.
unit-tests: build-tests
	cmake --build ./build --target test
30 changes: 30 additions & 0 deletions extra/cpp/tokenizers/include/metta_tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#pragma once

#include <string>

// NOTE(review): a using-directive in a header is visible in every file that includes it.
// tests/test_metta_tokenizer.cc appears to rely on it for its unqualified `string`, so it
// cannot be removed without updating that file as well.
using namespace std;

// -------------------------------------------------------------------------------------------------
/**
 * @brief Tokenizes a MeTTa expression into a flat, space-separated token string.
 *
 * The expression is expected to be in the format `(Similarity (Concept "human") $v1)`.
 *
 * - Every parenthesized group is emitted as a link of type `Expression`, followed by the number
 *   of elements directly inside it and then by those elements in order.
 * - A group is emitted as `LINK_TEMPLATE` when at least one of its direct elements is a variable,
 *   and as `LINK` otherwise.
 * - An element that starts with `$` is emitted as `VARIABLE <name>`, with the `$` removed.
 * - Any other element, such as `Similarity`, `Concept` or `"human"`, is emitted as
 *   `NODE Symbol <element>`.
 *
 * Example:
 *
 * Input: `(Similarity (Concept "human") $v1)`
 *
 * Output: `LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol "human" VARIABLE v1`
 *
 * @param expression The input MeTTa expression string to be tokenized.
 * @return The tokenized representation of the expression, as a single string.
 * @throws runtime_error if the expression is invalid (for example, when a closing parenthesis
 *         is missing).
 */
string tokenize(const string& expression);

// -------------------------------------------------------------------------------------------------

15 changes: 15 additions & 0 deletions extra/cpp/tokenizers/main.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include <iostream>

#include "metta_tokenizer.h"

using namespace std;

// -------------------------------------------------------------------------------------------------
/**
 * Demo entry point: tokenizes one hard-coded MeTTa expression and prints the result to stdout.
 *
 * The expression is tokenized exactly once and the stored result is printed (the previous
 * version tokenized it a second time and never used the first result).
 *
 * @return 0 on success.
 */
int main() {
    const std::string expression = "(Similarity (Concept \"human\") $v1)";
    const std::string tokenized = tokenize(expression);
    std::cout << tokenized << std::endl;
    return 0;
}

// -------------------------------------------------------------------------------------------------
79 changes: 79 additions & 0 deletions extra/cpp/tokenizers/src/metta_tokenizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#include <cctype>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "metta_tokenizer.h"

using namespace std;

// -------------------------------------------------------------------------------------------------
/**
 * @brief Tokenizes one parenthesized MeTTa expression, recursing into nested expressions.
 *
 * Starting at `cursor`, leading whitespace is skipped and an opening parenthesis is required.
 * Every element up to the matching closing parenthesis is then emitted:
 *
 * - a nested `( ... )` group is tokenized recursively and counts as one element;
 * - a token starting with `$` becomes `VARIABLE <name>` and turns this expression's header into
 *   `LINK_TEMPLATE Expression` (the header of an enclosing expression is not affected);
 * - any other token becomes `NODE Symbol <token>`.
 *
 * Tokens are delimited by whitespace and parentheses only; double quotes get no special
 * treatment, so a quoted string containing a space is split into several tokens.
 *
 * @param expression The input MeTTa expression string to be tokenized.
 * @param cursor The position in `expression` at which to start. Defaults to 0.
 * @return A pair holding the index of the closing parenthesis that ends this expression and the
 *         tokenized string (`<header> <element count><elements...>`).
 * @throws runtime_error if no opening parenthesis is found at the start position or if the
 *         closing parenthesis is missing.
 */
std::pair<std::size_t, std::string> _tokenize(const std::string& expression,
                                              std::size_t cursor = 0) {
    const std::size_t size = expression.size();

    // <cctype> functions have undefined behaviour for negative values, which a plain `char`
    // holds for any non-ASCII byte, so always go through `unsigned char`.
    const auto is_space = [](char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; };

    // Tolerate whitespace before the opening parenthesis, then insist on the parenthesis itself
    // so that input such as `a b)` is rejected instead of being treated as an expression.
    while (cursor < size && is_space(expression[cursor])) {
        cursor++;
    }
    if (cursor >= size || expression[cursor] != '(') {
        throw std::runtime_error("Invalid expression");
    }
    cursor++;  // consume '('

    std::string header = "LINK Expression";
    std::string output;
    std::size_t target_count = 0;

    while (cursor < size) {
        const char ch = expression[cursor];

        if (ch == ')') {
            // The returned position is the index of this ')'; callers resume right after it.
            return std::make_pair(cursor, header + " " + std::to_string(target_count) + output);
        }

        if (is_space(ch)) {
            cursor++;
            continue;
        }

        if (ch == '(') {
            const auto nested = _tokenize(expression, cursor);
            output += " " + nested.second;
            target_count++;
            cursor = nested.first + 1;  // step past the nested ')'
            continue;
        }

        // Plain token: runs until the next whitespace or parenthesis.
        const std::size_t token_start = cursor;
        while (cursor < size && !is_space(expression[cursor]) && expression[cursor] != '(' &&
               expression[cursor] != ')') {
            cursor++;
        }
        const std::string token = expression.substr(token_start, cursor - token_start);

        if (token[0] == '$') {
            header = "LINK_TEMPLATE Expression";
            output += " VARIABLE " + token.substr(1);
        } else {
            output += " NODE Symbol " + token;
        }
        target_count++;
    }

    // Ran out of input before the closing parenthesis.
    throw std::runtime_error("Invalid expression");
}

// -------------------------------------------------------------------------------------------------
string tokenize(const string& expression) {
auto [_, tokenized_query] = _tokenize(expression);
return tokenized_query;
}

// -------------------------------------------------------------------------------------------------

30 changes: 30 additions & 0 deletions extra/cpp/tokenizers/tests/test_metta_tokenizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <gtest/gtest.h>

#include "metta_tokenizer.h"

// Checks tokenize() against known expression/output pairs and verifies that malformed input
// is rejected. Names are std-qualified so the test does not depend on a using-directive
// leaking out of metta_tokenizer.h.
TEST(MettaTokenizerTest, BasicAssertions) {
    struct Case {
        const char* expression;
        const char* expected;
    };

    const Case cases[] = {
        // Variable as the last element of the outer expression.
        {"(Similarity (Concept \"human\") $v1)",
         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1"},
        // Variable before the nested expression.
        {"(Similarity $v1 (Concept \"human\"))",
         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity VARIABLE v1 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\""},
        // Variables on both sides of the nested expression.
        {"(Similarity $v0 (Concept \"human\") $v1)",
         "LINK_TEMPLATE Expression 4 NODE Symbol Similarity VARIABLE v0 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1"},
        // Variable inside the nested expression makes the nested link a template too.
        {"(Similarity (Concept $v0) $v1)",
         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK_TEMPLATE Expression 2 NODE Symbol Concept VARIABLE v0 VARIABLE v1"},
    };

    for (const Case& test_case : cases) {
        SCOPED_TRACE(test_case.expression);  // identifies the failing expression
        EXPECT_EQ(tokenize(test_case.expression), std::string(test_case.expected));
    }

    // Malformed input: missing closing parenthesis, and empty input.
    EXPECT_ANY_THROW(tokenize("(Similarity (Concept \"human\") $v1"));
    EXPECT_ANY_THROW(tokenize(""));
}

int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

0 comments on commit 6a41583

Please sign in to comment.