Skip to content

Commit

Permalink
fix: refactor scanner to use utf8, windows is confusing me
Browse files Browse the repository at this point in the history
  • Loading branch information
liz3 committed Nov 5, 2024
1 parent a9d86db commit 4f856d5
Showing 1 changed file with 47 additions and 26 deletions.
73 changes: 47 additions & 26 deletions src/vm/scanner.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <string.h>
#include <uchar.h>

#include "common.h"
#include "utf8.h"
#include "scanner.h"

void initScanner(Scanner *scanner, const char *source) {
Expand All @@ -10,17 +12,17 @@ void initScanner(Scanner *scanner, const char *source) {
scanner->rawString = false;
}

/**
 * Reports whether a code point can appear in an identifier.
 *
 * Only the ASCII letters and the underscore qualify. The parameter is a full
 * code point rather than a `char` so that values above 0xFF are compared as-is
 * instead of being narrowed onto an ASCII letter.
 *
 * @param c Code point to classify.
 * @return true for 'a'-'z', 'A'-'Z' and '_'; false otherwise.
 */
static bool isAlpha(char32_t c) {
    return (c >= 'a' && c <= 'z') ||
           (c >= 'A' && c <= 'Z') ||
           c == '_';
}

/**
 * Reports whether a code point is an ASCII decimal digit.
 *
 * Takes a full code point so that values above 0xFF are never narrowed onto
 * the '0'-'9' range.
 *
 * @param c Code point to classify.
 * @return true for '0'-'9'; false otherwise.
 */
static bool isDigit(char32_t c) {
    return c >= '0' && c <= '9';
}

/**
 * Reports whether a code point may appear inside a hexadecimal literal.
 *
 * Accepts '0'-'9', 'A'-'F', 'a'-'f' and the underscore.
 * NOTE(review): the underscore is presumably a digit separator; confirm
 * against the number-scanning code.
 *
 * @param c Code point to classify.
 * @return true when c is one of the accepted characters; false otherwise.
 */
static bool isHexDigit(char32_t c) {
    return (c >= '0' && c <= '9') ||
           (c >= 'A' && c <= 'F') ||
           (c >= 'a' && c <= 'f') ||
           c == '_';
}

Expand All @@ -29,23 +31,31 @@ static bool isAtEnd(Scanner *scanner) {
}

/**
 * Consumes one UTF-8 encoded code point from the input.
 *
 * Decodes the code point at scanner->current with utf8codepoint() and stores
 * the pointer it returns back into scanner->current, so the cursor moves by
 * the whole encoded sequence rather than by a single byte.
 *
 * The result is returned as char32_t; returning `char` would discard the upper
 * bits of any code point outside the single-byte range.
 *
 * @param scanner Scanner whose cursor is advanced.
 * @return The code point that was consumed.
 */
static char32_t advance(Scanner *scanner) {
    utf8_int32_t current;
    scanner->current = utf8codepoint(scanner->current, &current);
    return (char32_t)current;
}

/**
 * Returns the code point at the cursor without consuming it.
 *
 * The pointer returned by utf8codepoint() is deliberately ignored so that
 * scanner->current is left untouched.
 *
 * The result is returned as char32_t; returning `char` would discard the upper
 * bits of any code point outside the single-byte range.
 *
 * @param scanner Scanner to inspect.
 * @return The code point decoded at scanner->current.
 */
static char32_t peek(Scanner *scanner) {
    utf8_int32_t current;
    utf8codepoint(scanner->current, &current);
    return (char32_t)current;
}

/**
 * Returns the code point one position after the cursor without consuming
 * anything.
 *
 * The first utf8codepoint() call is used only to find where the code point at
 * the cursor ends; the second call decodes the one that follows. Indexing
 * scanner->current[1] instead would yield the next byte, which is not the
 * next code point when the current one is multi-byte.
 *
 * @param scanner Scanner to inspect.
 * @return '\0' when the scanner is already at the end of input, otherwise the
 *         code point following the one at the cursor.
 */
static char32_t peekNext(Scanner *scanner) {
    if (isAtEnd(scanner)) return '\0';

    utf8_int32_t current;
    const char *next = utf8codepoint(scanner->current, &current);
    utf8codepoint(next, &current);
    return (char32_t)current;
}

/**
 * Consumes the code point at the cursor only if it equals `expected`.
 *
 * The cursor is moved exactly once, to the pointer returned by
 * utf8codepoint(), and only on a successful match. No byte-wise increment is
 * performed in addition, since that would skip into the following character.
 *
 * @param scanner  Scanner whose cursor may be advanced.
 * @param expected Code point to compare against.
 * @return true and advances on a match; false (cursor unchanged) when at the
 *         end of input or when the code point differs.
 */
static bool match(Scanner *scanner, char32_t expected) {
    if (isAtEnd(scanner)) return false;

    utf8_int32_t current;
    const char *next = utf8codepoint(scanner->current, &current);
    if ((char32_t)current != expected) return false;

    scanner->current = next;
    return true;
}

Expand All @@ -70,7 +80,7 @@ static LangToken errorToken(Scanner *scanner, const char *message) {

static void skipWhitespace(Scanner *scanner) {
for (;;) {
char c = peek(scanner);
char32_t c = peek(scanner);
switch (c) {
case ' ':
case '\r':
Expand Down Expand Up @@ -134,11 +144,22 @@ static LangTokenType checkKeyword(Scanner *scanner, int start, int length,
return TOKEN_IDENTIFIER;
}

/**
 * Returns the code point located `offset` code points after `ptr`.
 *
 * The offset counts decoded code points, not bytes: the string is walked with
 * utf8codepoint() offset + 1 times and the last decoded value is returned.
 *
 * @param ptr    Start of the UTF-8 text to index into.
 * @param offset Zero-based code point index.
 * @return The code point at that index, or 0 when `offset` is negative or a
 *         code point of 0 (the string terminator) is decoded before the index
 *         is reached.
 */
char32_t getCharacter(const char *ptr, int offset) {
    /* Initialised so a negative offset (zero iterations) yields a defined 0. */
    utf8_int32_t current = 0;

    for (int i = 0; i <= offset; i++) {
        ptr = utf8codepoint(ptr, &current);
        if (current == 0) {
            /* Do not keep decoding beyond the terminator. */
            break;
        }
    }
    return (char32_t)current;
}

static LangTokenType identifierType(Scanner *scanner) {
switch (scanner->start[0]) {

switch (getCharacter(scanner->start, 0)) {
case 'a':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'b': {
return checkKeyword(scanner, 2, 6, "stract", TOKEN_ABSTRACT);
}
Expand All @@ -157,7 +178,7 @@ static LangTokenType identifierType(Scanner *scanner) {
return checkKeyword(scanner, 1, 4, "reak", TOKEN_BREAK);
case 'c':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'a':
return checkKeyword(scanner, 2, 2, "se", TOKEN_CASE);
case 'l':
Expand All @@ -166,7 +187,7 @@ static LangTokenType identifierType(Scanner *scanner) {
// Skip second char
// Skip third char
if (scanner->current - scanner->start > 3) {
switch (scanner->start[3]) {
switch (getCharacter(scanner->start, 3)) {
case 't':
return checkKeyword(scanner, 4, 4, "inue", TOKEN_CONTINUE);
case 's':
Expand All @@ -179,15 +200,15 @@ static LangTokenType identifierType(Scanner *scanner) {
break;
case 'd':
if (scanner->current - scanner->start > 3) {
switch (scanner->start[3]) {
switch (getCharacter(scanner->start, 3)) {
case 'a':
return checkKeyword(scanner, 4, 3, "ult", TOKEN_DEFAULT);
}
}
return checkKeyword(scanner, 1, 2, "ef", TOKEN_DEF);
case 'e':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'l':
return checkKeyword(scanner, 2, 2, "se", TOKEN_ELSE);
case 'n':
Expand All @@ -197,7 +218,7 @@ static LangTokenType identifierType(Scanner *scanner) {
break;
case 'f':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'a':
return checkKeyword(scanner, 2, 3, "lse", TOKEN_FALSE);
case 'o':
Expand All @@ -209,7 +230,7 @@ static LangTokenType identifierType(Scanner *scanner) {
break;
case 'i':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'f':
return checkKeyword(scanner, 2, 0, "", TOKEN_IF);
case 'm':
Expand All @@ -219,7 +240,7 @@ static LangTokenType identifierType(Scanner *scanner) {
break;
case 'n':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'o':
return checkKeyword(scanner, 2, 1, "t", TOKEN_NOT);
case 'i':
Expand All @@ -233,7 +254,7 @@ static LangTokenType identifierType(Scanner *scanner) {
return checkKeyword(scanner, 1, 6, "rivate", TOKEN_PRIVATE);
case 'r':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'e':
return checkKeyword(scanner, 2, 4, "turn", TOKEN_RETURN);
}
Expand All @@ -246,7 +267,7 @@ static LangTokenType identifierType(Scanner *scanner) {
break;
case 's':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 't':
return checkKeyword(scanner, 2, 4, "atic", TOKEN_STATIC);
case 'u':
Expand All @@ -258,12 +279,12 @@ static LangTokenType identifierType(Scanner *scanner) {
break;
case 't':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'h':
return checkKeyword(scanner, 2, 2, "is", TOKEN_THIS);
case 'r':
if (scanner->current - scanner->start > 2) {
switch (scanner->start[2]) {
switch (getCharacter(scanner->start, 2)) {
case 'u':
return checkKeyword(scanner, 3, 1, "e", TOKEN_TRUE);
case 'a':
Expand All @@ -279,7 +300,7 @@ static LangTokenType identifierType(Scanner *scanner) {
return checkKeyword(scanner, 1, 2, "ar", TOKEN_VAR);
case 'w':
if (scanner->current - scanner->start > 1) {
switch (scanner->start[1]) {
switch (getCharacter(scanner->start, 1)) {
case 'h':
return checkKeyword(scanner, 2, 3, "ile", TOKEN_WHILE);
case 'i':
Expand Down Expand Up @@ -366,7 +387,7 @@ LangToken scanToken(Scanner *scanner) {

if (isAtEnd(scanner)) return makeToken(scanner, TOKEN_EOF);

char c = advance(scanner);
char32_t c = advance(scanner);

if (isAlpha(c)) return identifier(scanner);
if (isDigit(c)) return hexNumber(scanner);
Expand Down

0 comments on commit 4f856d5

Please sign in to comment.