Skip to content

Commit

Permalink
use flat memory layout
Browse files Browse the repository at this point in the history
  • Loading branch information
marler8997 committed Mar 6, 2021
1 parent 711981b commit a4ef7f9
Showing 1 changed file with 103 additions and 84 deletions.
187 changes: 103 additions & 84 deletions re.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,30 +35,31 @@

/* Definitions: */

#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */
#define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */
#define MAX_REGEXP_LEN 70 /* Max number of bytes for a regex. */


enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ };

typedef struct regex_t
{
unsigned char type; /* CHAR, STAR, etc. */
union
{
unsigned char ch; /* the character itself */
unsigned char* ccl; /* OR a pointer to characters in class */
} u;
unsigned char type; /* CHAR, STAR, etc. */
unsigned char data_len;
unsigned char data[0];
} regex_t;

/* Return the object stored immediately after `pattern` in the flat buffer.
   The step is 2 bytes (the `type` and `data_len` header fields, one byte each)
   plus the `data_len` payload bytes that belong to `pattern`. */
static re_t getnext(regex_t* pattern)
{
  unsigned char* cursor = (unsigned char*)pattern;
  cursor += 2;                 /* skip the type + data_len header */
  cursor += pattern->data_len; /* skip this object's payload */
  return (re_t)cursor;
}



/* Private function declarations: */
static int matchpattern(regex_t* pattern, const char* text, int* matchlength);
static int matchcharclass(char c, const char* str);
static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength);
static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength);
static int matchone(regex_t p, char c);
static int matchstar(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
static int matchplus(regex_t *p, regex_t* pattern, const char* text, int* matchlength);
static int matchone(regex_t* p, char c);
static int matchdigit(char c);
static int matchalpha(char c);
static int matchwhitespace(char c);
Expand All @@ -80,9 +81,9 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
*matchlength = 0;
if (pattern != 0)
{
if (pattern[0].type == BEGIN)
if (pattern->type == BEGIN)
{
return ((matchpattern(&pattern[1], text, matchlength)) ? 0 : -1);
return ((matchpattern(getnext(pattern), text, matchlength)) ? 0 : -1);
}
else
{
Expand All @@ -106,33 +107,37 @@ int re_matchp(re_t pattern, const char* text, int* matchlength)
return -1;
}

/* Return the smaller of two ints; when both are equal, `a` is returned. */
static int min(int a, int b)
{
  if (b < a)
  {
    return b;
  }
  return a;
}

re_t re_compile(const char* pattern)
{
/* The sizes of the two static arrays below substantiates the static RAM usage of this module.
MAX_REGEXP_OBJECTS is the max number of symbols in the expression.
MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. */
static regex_t re_compiled[MAX_REGEXP_OBJECTS];
static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN];
int ccl_bufidx = 1;
/* The size of this static array determines the static RAM usage of this module.
MAX_REGEXP_LEN is the max number of bytes in the expression. */
static unsigned char re_data[MAX_REGEXP_LEN];

char c; /* current char in pattern */
int i = 0; /* index into pattern */
int j = 0; /* index into re_compiled */
int j = 0; /* index into re_data */

while (pattern[i] != '\0' && (j+1 < MAX_REGEXP_OBJECTS))
while (pattern[i] != '\0' && (j+3 < MAX_REGEXP_LEN))
{
c = pattern[i];
regex_t *re_compiled = (regex_t*)(re_data+j);
re_compiled->data_len = 0;

switch (c)
{
/* Meta-characters: */
case '^': { re_compiled[j].type = BEGIN; } break;
case '$': { re_compiled[j].type = END; } break;
case '.': { re_compiled[j].type = DOT; } break;
case '*': { re_compiled[j].type = STAR; } break;
case '+': { re_compiled[j].type = PLUS; } break;
case '?': { re_compiled[j].type = QUESTIONMARK; } break;
/* case '|': { re_compiled[j].type = BRANCH; } break; <-- not working properly */
case '^': { re_compiled->type = BEGIN; } break;
case '$': { re_compiled->type = END; } break;
case '.': { re_compiled->type = DOT; } break;
case '*': { re_compiled->type = STAR; } break;
case '+': { re_compiled->type = PLUS; } break;
case '?': { re_compiled->type = QUESTIONMARK; } break;
/* case '|': { re_compiled->type = BRANCH; } break; <-- not working properly */

/* Escaped character-classes (\s \w ...): */
case '\\':
Expand All @@ -145,41 +150,46 @@ re_t re_compile(const char* pattern)
switch (pattern[i])
{
/* Meta-character: */
case 'd': { re_compiled[j].type = DIGIT; } break;
case 'D': { re_compiled[j].type = NOT_DIGIT; } break;
case 'w': { re_compiled[j].type = ALPHA; } break;
case 'W': { re_compiled[j].type = NOT_ALPHA; } break;
case 's': { re_compiled[j].type = WHITESPACE; } break;
case 'S': { re_compiled[j].type = NOT_WHITESPACE; } break;
case 'd': { re_compiled->type = DIGIT; } break;
case 'D': { re_compiled->type = NOT_DIGIT; } break;
case 'w': { re_compiled->type = ALPHA; } break;
case 'W': { re_compiled->type = NOT_ALPHA; } break;
case 's': { re_compiled->type = WHITESPACE; } break;
case 'S': { re_compiled->type = NOT_WHITESPACE; } break;

/* Escaped character, e.g. '.' or '$' */
default:
{
re_compiled[j].type = CHAR;
re_compiled[j].u.ch = pattern[i];
if (j + 4 >= MAX_REGEXP_LEN) {
//fputs("exceeded internal buffer!\n", stderr);
return 0;
}
re_compiled->type = CHAR;
re_compiled->data_len = 1;
re_compiled->data[0] = pattern[i];
} break;
}
}
/* '\\' as last char in pattern -> invalid regular expression. */
/*
else
{
re_compiled[j].type = CHAR;
re_compiled[j].ch = pattern[i];
re_compiled->type = CHAR;
re_compiled->data_len = 1;
re_compiled->data[0] = pattern[i];
}
*/
} break;

/* Character class: */
case '[':
{
/* Remember where the char-buffer starts. */
int buf_begin = ccl_bufidx;
int char_limit = min(0xff, MAX_REGEXP_LEN - j - 4); // 4 for this object and UNUSED at the minimum

/* Look-ahead to determine if negated */
if (pattern[i+1] == '^')
{
re_compiled[j].type = INV_CHAR_CLASS;
re_compiled->type = INV_CHAR_CLASS;
i += 1; /* Increment i to avoid including '^' in the char-buffer */
if (pattern[i+1] == 0) /* incomplete pattern, missing non-zero char after '^' */
{
Expand All @@ -188,7 +198,7 @@ re_t re_compile(const char* pattern)
}
else
{
re_compiled[j].type = CHAR_CLASS;
re_compiled->type = CHAR_CLASS;
}

/* Copy characters inside [..] to buffer */
Expand All @@ -197,7 +207,7 @@ re_t re_compile(const char* pattern)
{
if (pattern[i] == '\\')
{
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1)
if (re_compiled->data_len >= char_limit)
{
//fputs("exceeded internal buffer!\n", stderr);
return 0;
Expand All @@ -206,31 +216,32 @@ re_t re_compile(const char* pattern)
{
return 0;
}
ccl_buf[ccl_bufidx++] = pattern[i++];
re_compiled->data[re_compiled->data_len++] = pattern[i++];
}
else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
// TODO: I think this "else if" is a bug, should just be "if"
else if (re_compiled->data_len >= char_limit)
{
//fputs("exceeded internal buffer!\n", stderr);
return 0;
}
ccl_buf[ccl_bufidx++] = pattern[i];
re_compiled->data[re_compiled->data_len++] = pattern[i];
}
if (ccl_bufidx >= MAX_CHAR_CLASS_LEN)
if (re_compiled->data_len >= char_limit)
{
/* Catches cases such as [00000000000000000000000000000000000000][ */
//fputs("exceeded internal buffer!\n", stderr);
return 0;
}
/* Null-terminate string end */
ccl_buf[ccl_bufidx++] = 0;
re_compiled[j].u.ccl = &ccl_buf[buf_begin];
re_compiled->data[re_compiled->data_len++] = 0;
} break;

/* Other characters: */
default:
{
re_compiled[j].type = CHAR;
re_compiled[j].u.ch = c;
re_compiled->type = CHAR;
re_compiled->data_len = 1;
re_compiled->data[0] = c;
} break;
}
/* no buffer-out-of-bounds access on invalid patterns - see https://github.com/kokke/tiny-regex-c/commit/1a279e04014b70b0695fba559a7c05d55e6ee90b */
Expand All @@ -240,35 +251,35 @@ re_t re_compile(const char* pattern)
}

i += 1;
j += 1;
j += 2 + re_compiled->data_len;
}
/* 'UNUSED' is a sentinel used to indicate end-of-pattern */
re_compiled[j].type = UNUSED;
re_data[j] = UNUSED;
re_data[j+1] = 0;

return (re_t) re_compiled;
return (re_t) re_data;
}

void re_print(regex_t* pattern)
{
const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" };

int i;
int j;
char c;
for (i = 0; i < MAX_REGEXP_OBJECTS; ++i)
for (;; pattern = getnext(pattern))
{
if (pattern[i].type == UNUSED)
if (pattern->type == UNUSED)
{
break;
}

printf("type: %s", types[pattern[i].type]);
if (pattern[i].type == CHAR_CLASS || pattern[i].type == INV_CHAR_CLASS)
printf("type: %s", types[pattern->type]);
if (pattern->type == CHAR_CLASS || pattern->type == INV_CHAR_CLASS)
{
printf(" [");
for (j = 0; j < MAX_CHAR_CLASS_LEN; ++j)
for (j = 0; j < pattern->data_len; ++j)
{
c = pattern[i].u.ccl[j];
c = pattern->data[j];
if ((c == '\0') || (c == ']'))
{
break;
Expand All @@ -277,9 +288,9 @@ void re_print(regex_t* pattern)
}
printf("]");
}
else if (pattern[i].type == CHAR)
else if (pattern->type == CHAR)
{
printf(" '%c'", pattern[i].u.ch);
printf(" '%c'", pattern->data[0]);
}
printf("\n");
}
Expand Down Expand Up @@ -380,24 +391,25 @@ static int matchcharclass(char c, const char* str)
return 0;
}

static int matchone(regex_t p, char c)
static int matchone(regex_t* p, char c)
{
switch (p.type)
switch (p->type)
{
case DOT: return matchdot(c);
case CHAR_CLASS: return matchcharclass(c, (const char*)p.u.ccl);
case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p.u.ccl);
case CHAR_CLASS: return matchcharclass(c, (const char*)p->data);
case INV_CHAR_CLASS: return !matchcharclass(c, (const char*)p->data);
case DIGIT: return matchdigit(c);
case NOT_DIGIT: return !matchdigit(c);
case ALPHA: return matchalphanum(c);
case NOT_ALPHA: return !matchalphanum(c);
case WHITESPACE: return matchwhitespace(c);
case NOT_WHITESPACE: return !matchwhitespace(c);
default: return (p.u.ch == c);
case BEGIN: return 0;
default: return (p->data[0] == c);
}
}

static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchlength)
static int matchstar(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
{
int prelen = *matchlength;
const char* prepoint = text;
Expand All @@ -417,7 +429,7 @@ static int matchstar(regex_t p, regex_t* pattern, const char* text, int* matchle
return 0;
}

static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchlength)
static int matchplus(regex_t* p, regex_t* pattern, const char* text, int* matchlength)
{
const char* prepoint = text;
while ((text[0] != '\0') && matchone(p, *text))
Expand All @@ -435,10 +447,8 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text, int* matchle
return 0;
}

static int matchquestion(regex_t p, regex_t* pattern, const char* text, int* matchlength)
static int matchquestion(regex_t *p, regex_t* pattern, const char* text, int* matchlength)
{
if (p.type == UNUSED)
return 1;
if (matchpattern(pattern, text, matchlength))
return 1;
if (*text && matchone(p, *text++))
Expand Down Expand Up @@ -493,33 +503,42 @@ static int matchpattern(regex_t* pattern, const char* text, int *matchlength)
static int matchpattern(regex_t* pattern, const char* text, int* matchlength)
{
int pre = *matchlength;
do
while (1)
{
if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK))
if (pattern->type == UNUSED)
{
return 1;
}
regex_t* next_pattern = getnext(pattern);
if (next_pattern->type == QUESTIONMARK)
{
return matchquestion(pattern[0], &pattern[2], text, matchlength);
return matchquestion(pattern, getnext(next_pattern), text, matchlength);
}
else if (pattern[1].type == STAR)
else if (next_pattern->type == STAR)
{
return matchstar(pattern[0], &pattern[2], text, matchlength);
return matchstar(pattern, getnext(next_pattern), text, matchlength);
}
else if (pattern[1].type == PLUS)
else if (next_pattern->type == PLUS)
{
return matchplus(pattern[0], &pattern[2], text, matchlength);
return matchplus(pattern, getnext(next_pattern), text, matchlength);
}
else if ((pattern[0].type == END) && pattern[1].type == UNUSED)
else if ((pattern->type == END) && next_pattern->type == UNUSED)
{
return (text[0] == '\0');
}
/* Branching is not working properly
else if (pattern[1].type == BRANCH)
else if (pattern->type == BRANCH)
{
return (matchpattern(pattern, text) || matchpattern(&pattern[2], text));
return (matchpattern(pattern, text) || matchpattern(getnext(next_pattern), text));
}
*/
(*matchlength)++;
if (text[0] == '\0')
break;
if (!matchone(pattern, *text++))
break;
pattern = next_pattern;
}
while ((text[0] != '\0') && matchone(*pattern++, *text++));

*matchlength = pre;
return 0;
Expand Down

0 comments on commit a4ef7f9

Please sign in to comment.