I am working with the POSIX regex functions to perform string matching in my C program. I have set up the matching rules and initialized them properly. Within my `make_token` function, I call `regexec` to check, rule by rule, whether the input at the current position matches one of the specified patterns.
/*
 * Token-recognition table. Each entry pairs a POSIX extended regular
 * expression with the token type recorded when that expression matches.
 *
 * make_token() tries the entries in array order and takes the first match,
 * so order matters: the hexadecimal rule must precede the decimal rule,
 * otherwise "0x10" would be split after the leading "0".
 *
 * Characters that are ERE operators ('+', '*', '(', ')', '$') need a
 * backslash in the pattern, which is written as "\\" in a C string literal.
 * A single backslash would be consumed by the C compiler as an (invalid)
 * escape sequence and the bare operator would reach regcomp().
 */
static struct rule {
  const char *regex;
  int token_type;
} rules[] = {
  {" +", TK_NOTYPE},               // spaces
  {"\\+", '+'},                    // plus
  {"-", '-'},                      // minus ('-' is an ordinary character outside brackets)
  {"\\*", '*'},                    // multiply or TK_DEREF
  {"/", '/'},                      // divide ('/' is an ordinary character)
  {"\\(", '('},
  {"\\)", ')'},
  {"==", TK_EQ},
  {"!=", TK_NEQ},
  {"&&", TK_AND},
  {"\\$[a-zA-Z0-9]+", TK_REG},     // literal '$' followed by a register name
  {"0x[0-9a-fA-F]+", TK_HEX},
  {"[0-9]+", TK_DEC},
};
/* Number of entries in rules[]. ARRLEN is defined elsewhere (presumably an
 * element-count macro -- confirm against its definition). */
#define NR_REGEX ARRLEN(rules)
/* Compiled patterns, one per rule: init_regex() compiles rules[i].regex
 * into re[i], and make_token() passes re[i] to regexec(). */
static regex_t re[NR_REGEX] = {};
/*
 * Compile every pattern in rules[] into the corresponding slot of re[]
 * using POSIX extended syntax (REG_EXTENDED).
 *
 * If any pattern fails to compile, the regerror() message and the
 * offending pattern are reported through panic() (a project macro defined
 * elsewhere; presumably it does not return -- confirm).
 */
void init_regex(void) {
  char error_msg[128];
  int i;

  for (i = 0; i < NR_REGEX; i ++) {
    int ret = regcomp(&re[i], rules[i].regex, REG_EXTENDED);
    if (ret != 0) {
      /* Size taken from the buffer itself so the two cannot drift apart. */
      regerror(ret, &re[i], error_msg, sizeof(error_msg));
      panic("regex compilation failed: %s\n%s", error_msg, rules[i].regex);
    }
  }
}
/* One recognized token: its type tag plus a copy of the matched text. */
typedef struct token {
int type; // token type; make_token() copies it from rules[i].token_type
char str[32]; // matched text as a NUL-terminated string (32 bytes including the terminator)
} Token;
/* Tokens recorded by make_token() for the expression it was last called on. */
static Token tokens[32] __attribute__((used)) = {};
/* Number of entries of tokens[] filled so far; make_token() resets it to 0
 * on entry and increments it for every recorded token. */
static int nr_token __attribute__((used)) = 0;
static bool make_token(char *e) {
int position = 0;
int i;
regmatch_t pmatch;
nr_token = 0;
while (e[position] != '') {
/* Try all rules one by one. */
for (i = 0; i < NR_REGEX; i ++) {
if (regexec(&re[i], e + position, 1, &pmatch, 0) == 0 && pmatch.rm_so == 0) {
char *substr_start = e + position;
int substr_len = pmatch.rm_eo;
// Log("match rules[%d] = "%s" at position %d with len %d: %.*s",
// i, rules[i].regex, position, substr_len, substr_len, substr_start);
position += substr_len;
/* TODO: Now a new token is recognized with rules[i]. Add codes
* to record the token in the array `tokens'. For certain types
* of tokens, some extra actions should be performed.
*/
switch (rules[i].token_type) {
case TK_DEC: case TK_HEX:
{
if(substr_len > 31) {
Log("The input number is too long!");
return false;
}
tokens[nr_token].type = rules[i].token_type;
strncpy(tokens[nr_token].str, substr_start, substr_len);
tokens[nr_token].str[substr_len] = '';
++nr_token;
break;
}
case TK_NOTYPE: break;
default:
{
tokens[nr_token].type = rules[i].token_type;
strncpy(tokens[nr_token].str, substr_start, substr_len);
tokens[nr_token].str[substr_len] = '';
++nr_token;
break;
}
}
if(nr_token > 32) {
Log("The token number of the input expression is too large!");
return false;
}
break;
}
}
if (i == NR_REGEX) {
printf("no match at position %dn%sn%*.s^n", position, e, position, "");
return false;
}
} // end while
return true;
}
Behavior Without AddressSanitizer:
When I compile and run my program without AddressSanitizer, it executes successfully and performs the string matching as expected.
Behavior With AddressSanitizer:
Upon enabling AddressSanitizer, my program throws a heap-buffer-overflow error. Debugging with gdb revealed that the error is triggered during the execution of the `regexec` function.
**Specific Error Message:**
==14631==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x602000003a16 at pc 0x7f017febcfd9 bp 0x7ffd16c98e30 sp 0x7ffd16c985d8
READ of size 5 at 0x602000003a16 thread T0
#0 0x7f017febcfd8 in __interceptor_regexec ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:7756
#1 0x55c052b5a2af in make_token src/monitor/sdb/expr.c:103
#2 0x55c052b5b304 in expr src/monitor/sdb/expr.c:252
#3 0x55c052b5b8b2 in cmd_p src/monitor/sdb/sdb.c:135
#4 0x55c052b5bead in sdb_mainloop src/monitor/sdb/sdb.c:247
#5 0x55c052b5dff1 in engine_start src/engine/interpreter/init.c:25
#6 0x55c052b572df in main src/nemu-main.c:37
#7 0x7f017a8d3d8f in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58
#8 0x7f017a8d3e3f in __libc_start_main_impl ../csu/libc-start.c:392
#9 0x55c052b57204 in _start (/home/xyr/ics2023/nemu/build/riscv32-nemu-interpreter+0xc204)
0x602000003a16 is located 0 bytes to the right of 6-byte region [0x602000003a10,0x602000003a16)
allocated by thread T0 here:
#0 0x7f017ff1f887 in __interceptor_malloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:145
#1 0x7f017fe50bac in xmalloc (/lib/x86_64-linux-gnu/libreadline.so.8+0x39bac)
SUMMARY: AddressSanitizer: heap-buffer-overflow ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:7756 in __interceptor_regexec
Shadow bytes around the buggy address:
0x0c047fff86f0: fa fa fd fa fa fa fd fa fa fa fd fa fa fa fd fa
0x0c047fff8700: fa fa fd fa fa fa fd fd fa fa fd fd fa fa fd fd
0x0c047fff8710: fa fa fd fd fa fa fd fd fa fa fd fd fa fa fd fd
0x0c047fff8720: fa fa fd fd fa fa fd fd fa fa fd fd fa fa fd fd
0x0c047fff8730: fa fa fd fd fa fa fd fd fa fa fd fd fa fa 00 00
=>0x0c047fff8740: fa fa[06]fa fa fa 00 04 fa fa 06 fa fa fa fa fa
0x0c047fff8750: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c047fff8760: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c047fff8770: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c047fff8780: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c047fff8790: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
Shadow gap: cc
==14631==ABORTING
I am seeking insight into why the heap-buffer-overflow error might be triggered when regexec is called, despite the program working fine without AddressSanitizer. Any assistance you could provide would be greatly appreciated.
Environment:
Ubuntu 22.04.4 LTS
gcc 11.4.0