-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
6 changed files
with
201 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# FetchContent_MakeAvailable() (used below) only exists in CMake 3.14 and newer, so that is the
# real minimum version this project can be configured with.
cmake_minimum_required(VERSION 3.14)
project(Tokenizers)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Add GoogleTest (downloaded and built as part of this project).
include(FetchContent)
FetchContent_Declare(
    googletest
    DOWNLOAD_EXTRACT_TIMESTAMP ON
    URL https://github.com/google/googletest/archive/refs/tags/v1.15.2.zip)
FetchContent_MakeAvailable(googletest)

enable_testing()

add_library(tokenizers STATIC
    src/metta_tokenizer.cc)

# PUBLIC: the library itself and every target linking against it get these include paths.
target_include_directories(tokenizers PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/include)

# Portable replacement for linking the raw `pthread` library name.
find_package(Threads REQUIRED)

add_executable(
    test_metta_tokenizer
    tests/test_metta_tokenizer.cc)

# GoogleTest is consumed through its exported target; the FindGTest variables
# (GTEST_INCLUDE_DIRS / GTEST_LIBRARIES) are not populated by FetchContent.
target_link_libraries(
    test_metta_tokenizer
    PRIVATE
    tokenizers
    GTest::gtest_main
    Threads::Threads)

include(GoogleTest)
gtest_discover_tests(test_metta_tokenizer)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# None of these targets produce a file of the same name.
.PHONY: clean build-tests unit-tests

# Remove the whole CMake build tree.
clean:
	@rm -rf ./build

# Configure and build from scratch.
# `$$(nproc)` is passed to the shell as `$(nproc)`; a single `$` would make `make` expand an
# (undefined) variable named `nproc` and hand `--parallel` an empty job count.
build-tests: clean
	@mkdir -p ./build \
	&& cmake -S . -B ./build \
	&& cmake --build ./build --parallel $$(nproc)

# Run the test suite registered with CTest; $(MAKE) propagates flags and the jobserver.
unit-tests: build-tests
	$(MAKE) -C ./build test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#pragma once | ||
|
||
#include <string> | ||
|
||
using namespace std; | ||
|
||
// ------------------------------------------------------------------------------------------------- | ||
/**
 * @brief Parses a MeTTa expression into a tokenized string stream.
 *
 * The input is expected to look like `(Similarity (Concept "human") $v1)`. Every parenthesized
 * group becomes a link of type `Expression`, followed by the number of elements it contains.
 * Each element is either a nested expression, a variable (an element starting with `$`, emitted
 * without the `$`) or a node of type `Symbol` (anything else, such as `Similarity`, `Concept` or
 * `"human"`). A group that directly contains at least one variable is emitted as `LINK_TEMPLATE`
 * instead of `LINK`.
 *
 * Example:
 *
 * Input: `(Similarity (Concept "human") $v1)`
 *
 * Output: `LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol "human" VARIABLE v1`
 *
 * @param expression The input MeTTa expression string to be tokenized.
 * @return A tokenized string stream representing the parsed expression.
 * @throws runtime_error if the expression is invalid (e.g. a closing parenthesis is missing).
 */
std::string tokenize(const std::string& expression);
|
||
// ------------------------------------------------------------------------------------------------- | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#include <iostream> | ||
|
||
#include "metta_tokenizer.h" | ||
|
||
using namespace std; | ||
|
||
// ------------------------------------------------------------------------------------------------- | ||
int main() { | ||
string expression = "(Similarity (Concept \"human\") $v1)"; | ||
auto tokenized = tokenize(expression); | ||
cout << tokenize(expression) << endl; | ||
return 0; | ||
} | ||
|
||
// ------------------------------------------------------------------------------------------------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#include <cctype>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
|
||
#include "metta_tokenizer.h" | ||
|
||
using namespace std; | ||
|
||
// ------------------------------------------------------------------------------------------------- | ||
/**
 * @brief Tokenizes one parenthesized MeTTa expression, recursing into nested expressions.
 *
 * `cursor` must be the index of the opening `(` of the expression to parse. Scanning proceeds
 * until the matching `)`; each nested `(` is handled by a recursive call. Elements are separated
 * by whitespace or parentheses only. Quotes are NOT interpreted, so a quoted string that contains
 * whitespace is split into several symbols.
 *
 * Each element starting with `$` is emitted as `VARIABLE <name>` and switches the header of the
 * current expression to `LINK_TEMPLATE`; every other element is emitted as `NODE Symbol <text>`.
 *
 * @param expression The input MeTTa expression string to be tokenized.
 * @param cursor Index of the opening parenthesis of the expression to parse. Defaults to 0.
 * @return A pair containing the index of the matching closing parenthesis and the tokenized
 *         string stream (`<header> <element count><elements...>`).
 * @throws runtime_error if `cursor` does not point at `(`, if the closing parenthesis is missing,
 *         or if a variable has an empty name (a lone `$`).
 */
std::pair<std::size_t, std::string> _tokenize(const std::string& expression, std::size_t cursor = 0) {
    // <cctype> classifiers are undefined for negative `char` values (e.g. UTF-8 bytes on
    // platforms where `char` is signed), so always go through `unsigned char`.
    const auto is_delimiter = [](char c) {
        return c == '(' || c == ')' || std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    // Without this check, input such as `foo)` or `)` would be accepted as an expression.
    if (cursor >= expression.size() || expression[cursor] != '(') {
        throw std::runtime_error("Invalid expression");
    }

    std::string header = "LINK Expression";
    std::string output;
    int target_count = 0;

    for (++cursor; cursor < expression.size(); ++cursor) {
        const char ch = expression[cursor];

        if (ch == ')') {
            return {cursor, header + " " + std::to_string(target_count) + output};
        }

        if (ch == '(') {
            // The recursive call returns the index of the nested `)`; the loop increment then
            // steps past it.
            auto [nested_end, nested_tokens] = _tokenize(expression, cursor);
            cursor = nested_end;
            output += ' ';
            output += nested_tokens;
            ++target_count;
            continue;
        }

        if (is_delimiter(ch)) {
            continue;  // only whitespace can reach this point
        }

        // Plain element: consume everything up to the next delimiter.
        const std::size_t token_begin = cursor;
        while (cursor < expression.size() && !is_delimiter(expression[cursor])) {
            ++cursor;
        }
        const std::string token = expression.substr(token_begin, cursor - token_begin);
        --cursor;  // leave the delimiter for the next loop iteration

        if (token.front() == '$') {
            if (token.size() == 1) {
                // A lone `$` would emit `VARIABLE ` with no name and corrupt the stream.
                throw std::runtime_error("Invalid expression");
            }
            header = "LINK_TEMPLATE Expression";
            output += " VARIABLE ";
            output.append(token, 1, std::string::npos);
        } else {
            output += " NODE Symbol ";
            output += token;
        }
        ++target_count;
    }

    // Ran off the end of the input without finding the closing parenthesis.
    throw std::runtime_error("Invalid expression");
}
|
||
// ------------------------------------------------------------------------------------------------- | ||
string tokenize(const string& expression) { | ||
auto [_, tokenized_query] = _tokenize(expression); | ||
return tokenized_query; | ||
} | ||
|
||
// ------------------------------------------------------------------------------------------------- | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#include <gtest/gtest.h> | ||
|
||
#include "metta_tokenizer.h" | ||
|
||
// Compares tokenize() against hand-written token streams for expressions that combine symbols,
// nested expressions and variables in different positions.
TEST(MettaTokenizerTest, BasicAssertions) {
    struct Sample {
        std::string expression;
        std::string expected;
    };

    const Sample samples[] = {
        // Variable after a nested expression.
        {"(Similarity (Concept \"human\") $v1)",
         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1"},
        // Variable before a nested expression.
        {"(Similarity $v1 (Concept \"human\"))",
         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity VARIABLE v1 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\""},
        // Variables on both sides of a nested expression.
        {"(Similarity $v0 (Concept \"human\") $v1)",
         "LINK_TEMPLATE Expression 4 NODE Symbol Similarity VARIABLE v0 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1"},
        // Variable inside the nested expression as well.
        {"(Similarity (Concept $v0) $v1)",
         "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK_TEMPLATE Expression 2 NODE Symbol Concept VARIABLE v0 VARIABLE v1"},
    };

    for (const Sample& sample : samples) {
        EXPECT_EQ(tokenize(sample.expression), sample.expected);
    }
}
|
||
int main(int argc, char **argv) { | ||
::testing::InitGoogleTest(&argc, argv); | ||
return RUN_ALL_TESTS(); | ||
} |