I’m trying to build a high-performance, S-Expression based, C++/LLVM compiled DSL. I’ve written a small lexer for it, but I’m getting an infinite loop for all test cases I try without errors (which are caught and handled fine). What could the reason be?
#include <iostream>
#include <cstring>
#include <string>
#include <vector>
enum class TokenType {
BLOCK,
IF,
LET,
VAR,
QUOTE,
SET,
LAMBDA,
MACRO,
INTEGER,
FLOAT,
STRING,
BEGIN,
END,
SYMBOL,
BUILTIN,
TERMINATE
};
struct Token {
TokenType type;
std::string value;
};
class Lexer {
std::string code;
unsigned index = 0;
Token gettok(void) {
std::string data;
while (index < code.length() && code[index] != '(' && code[index] != ')' && code[index] != '"' && !std::isspace(code[index]))
data += code[index++];
// Handle (...)
if (data == "(") return { TokenType::BEGIN, data };
if (data == ")") return { TokenType::END, data };
// Handle $ID
if (data[0] == '$') return { TokenType::SYMBOL, data };
// Handle "..."
if (data == """) {
data = "";
while (index < code.length() && code[index] != '"') data += code[index++];
if (index == code.length()) throw "ERROR!";
return { TokenType::STRING, data };
}
// Handle numbers
if (std::isdigit(data[0]) || data[0] == '.') {
bool dot = false;
for (auto& c : data) {
if (c == '.' && dot) throw "ERROR!";
else if (c == '.') dot = true;
if (!std::isdigit(c) && c != '.') throw "ERROR!";
}
return { dot ? TokenType::FLOAT : TokenType::INTEGER, data };
}
// Handle special operators
if (data == "block") return { TokenType::BLOCK, data };
if (data == "if") return { TokenType::IF, data };
if (data == "let") return { TokenType::LET, data };
if (data == "var") return { TokenType::VAR, data };
if (data == "quote") return { TokenType::QUOTE, data };
if (data == "set") return { TokenType::SET, data };
if (data == "lambda") return { TokenType::LAMBDA, data };
if (data == "macro") return { TokenType::MACRO, data };
// Handle builtins
return { TokenType::BUILTIN, data };
}
public:
Lexer(std::string const& arg) : code(arg + " ") {}
std::vector<Token> lex(void) {
std::vector<Token> out;
Token next = gettok();
while (next.type != TokenType::TERMINATE) {
out.push_back(next);
next = gettok();
}
return out;
}
};
auto example = R"(
(function main (x) (block
(let $out 0)
$out)))";
int main(void) {
Lexer lexer(example);
auto foo = lexer.lex();
for (auto& bar : foo) std::cout << bar.value << std::endl;
}
I tried modifying the example, I rewrote the whole thing multiple times (this is the most recent version), stepped through the code, and tried asking ChatGPT & Copilot to see if they could catch the error (which they could not).