mirror of
synced 2025-03-14 02:49:39 +08:00
286 lines
8.9 KiB
286 lines
8.9 KiB
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include "nfa.h"
#include "re2nfa.h"
#include "configfile.h"
#include "generator.h"
#include <QFile>
#include <QCoreApplication>
#include <QFileInfo>
#include <QDateTime>
// One lexical element produced by tokenize().
struct Symbol
{
    QString token; // symbol name of the accepting DFA state that matched
    QString lexem; // the input text covered by the match
};
// Runs \a input through \a dfa and returns the recognized symbols.
//
// The scanner follows transitions character by character and remembers the
// most recent accepting state together with the lexem and input position
// that led to it. When a character has no transition, the remembered match
// is emitted and scanning resumes right after it (longest match with
// backtracking).
//
// \a cfg supplies the case sensitivity used for transition lookups and the
// debug flag that enables tracing. If \a ok is non-null it is set to false
// when the input cannot be matched (the symbols collected so far are still
// returned) and to true when the loop ran to the end of the input.
static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
{
    QList<Symbol> symbols;
    Symbol lastSymbol;
    int state = 0;
    int lastAcceptingState = -1;
    QString lastAcceptingLexem;
    int lastAcceptingPos = -1;

    for (int i = 0; i < input.length(); ++i) {
        QChar ch = input.at(i);
        // Transitions are looked up in lower case for case-insensitive
        // rule sets; the lexem keeps the character as it was written.
        QChar chForInput = ch;
        if (cfg->caseSensitivity == Qt::CaseInsensitive)
            chForInput = chForInput.toLower();
        // A result of 0 is treated as "no transition".
        int next = dfa.at(state).transitions.value(chForInput.unicode());
        if (cfg->debug)
            qDebug() << "input" << input.at(i) << "leads to state" << next;
        if (next) {
            // Record the character before checking for acceptance so that
            // the remembered lexem includes the character at
            // lastAcceptingPos; backtracking resumes one past that position.
            lastSymbol.lexem.append(ch);
            lastSymbol.token = dfa.at(next).symbol;
            if (!lastSymbol.token.isEmpty()) {
                lastAcceptingState = next;
                lastAcceptingLexem = lastSymbol.lexem;
                lastAcceptingPos = i;
            }
            state = next;
        } else {
            if (lastAcceptingState != -1) {
                if (cfg->debug)
                    qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
                Symbol s;
                s.token = dfa.at(lastAcceptingState).symbol;
                s.lexem = lastAcceptingLexem;
                symbols << s;
                lastSymbol = Symbol();
                state = 0;
                // The loop increment moves on to the first character after
                // the emitted lexem.
                i = lastAcceptingPos;
                lastAcceptingPos = -1;
                lastAcceptingState = -1;
                // Skip the error handling below: being back in state 0 is
                // expected here, not a sign of invalid input.
                continue;
            }
            if (state == 0 || lastSymbol.token.isEmpty()) {
                if (cfg->debug)
                    qDebug() << "invalid input";
                if (ok)
                    *ok = false;
                return symbols;
            }
            if (cfg->debug)
                qDebug() << "appending symbol with token" << lastSymbol.token;
            symbols << lastSymbol;
            lastSymbol = Symbol();
            state = 0;
            lastAcceptingState = -1;
            // The current character has not been consumed yet; scan it
            // again starting from the initial state.
            --i;
        }
    }

    // Flush whatever was matched when the input ran out.
    if (!lastSymbol.token.isEmpty()) {
        if (cfg->debug)
            qDebug() << "appending (last) symbol with token" << lastSymbol.token;
        symbols << lastSymbol;
    } else if (lastAcceptingState != -1) {
        if (cfg->debug)
            qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
        Symbol s;
        s.lexem = lastAcceptingLexem;
        s.token = dfa.at(lastAcceptingState).symbol;
        symbols << s;
    }
    if (ok)
        *ok = true;
    return symbols;
}
// Builds the set of input values the scanner has to cope with, based on the
// "InputType" entry of \a section.
//
// A missing entry defaults to "quint8", which yields the values 1..255.
// An empty set is returned (after printing a warning) when the entry is
// given more than once or names an unknown type.
static QSet<InputType> determineMaxInputSet(const ConfigFile::Section &section)
{
    QSet<InputType> set;

    QString inputTypeName;

    foreach (const ConfigFile::Entry &entry, section)
        if (entry.key == QLatin1String("InputType")) {
            if (!inputTypeName.isEmpty()) {
                qWarning("Error: InputType field specified multiple times in config file");
                return QSet<InputType>();
            }
            inputTypeName = entry.value;
        }

    if (inputTypeName.isEmpty())
        inputTypeName = "quint8";

    if (inputTypeName == "quint8") {
        // 0 is left out of the input set; the range starts at 1.
        for (int i = 1; i < 256; ++i)
            set.insert(i);
    } /* else if ### */
    else {
        qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
        return QSet<InputType>();
    }

    return set;
}
static bool loadConfig(const QString &ruleFile, Config *cfg)
ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
if (sections.isEmpty()) {
qWarning("Error parsing %s", qPrintable(ruleFile));
return false;
QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
if (maxInputSet.isEmpty())
return false;
Qt::CaseSensitivity cs = Qt::CaseInsensitive;
if (sections.value("Options").contains("case-sensitive"))
cs = Qt::CaseSensitive;
cfg->configSections = sections;
cfg->caseSensitivity = cs;
cfg->className = sections.value("Options").value("classname", "Scanner");
cfg->maxInputSet = maxInputSet;
cfg->ruleFile = ruleFile;
return true;
// Builds the minimized DFA for the rules described by \a cfg.
//
// With cfg.cache set, a "<rule file base name>.dfa" file that is newer than
// the rule file is loaded instead of regenerating the machine, and a freshly
// generated machine is written back to that file.
//
// Otherwise every entry of the "Macros" section is parsed into an NFA that
// later expressions may refer to, every entry of the "Tokens" section is
// parsed into an NFA, and all token NFAs are combined into one alternating
// NFA that is converted to a DFA and minimized.
//
// Returns an empty DFA after printing a warning when an expression fails to
// parse or the "Tokens" section is missing.
static DFA generateMachine(const Config &cfg)
{
    if (cfg.cache) {
        QFileInfo ruleInfo(cfg.ruleFile);
        QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
        if (cacheInfo.exists()
            && cacheInfo.lastModified() > ruleInfo.lastModified()) {
            QFile f(cacheInfo.absoluteFilePath());
            // Only trust the cache if it can actually be read; otherwise
            // fall through and regenerate the machine.
            if (f.open(QIODevice::ReadOnly)) {
                QDataStream stream(&f);
                DFA machine;
                stream >> machine;
                return machine;
            }
        }
    }

    QMap<QString, NFA> macros;
    foreach (const ConfigFile::Entry &e, cfg.configSections.value("Macros")) {
        int errCol = 0;
        if (cfg.debug)
            qDebug() << "parsing" << e.value;
        NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
        if (nfa.isEmpty()) {
            qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
            return DFA();
        }
        macros.insert(e.key, nfa);
    }

    if (!cfg.configSections.contains("Tokens")) {
        qWarning("Rule file does not contain a [Tokens] section!");
        return DFA();
    }

    QList<NFA> tokens;
    foreach (const ConfigFile::Entry &e, cfg.configSections.value("Tokens")) {
        int errCol = 0;
        if (cfg.debug)
            qDebug() << "parsing" << e.value;
        NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
        if (tok.isEmpty()) {
            qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
            return DFA();
        }
        // NOTE(review): nothing here attaches the token name (e.key) to the
        // parsed NFA, yet tokenize() relies on accepting DFA states carrying
        // a symbol. Confirm whether the NFA needs its symbol set at this
        // point.
        tokens.append(tok);
    }

    // Merge all token machines into a single alternation.
    NFA giganticStateMachine;
    foreach (NFA nfa, tokens)
        if (giganticStateMachine.isEmpty())
            giganticStateMachine = nfa;
        else
            giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);

    DFA result = giganticStateMachine.toDFA().minimize();
    if (cfg.cache) {
        QFileInfo ruleInfo(cfg.ruleFile);
        QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
        QFile f(cacheInfo.absoluteFilePath());
        // A cache that cannot be written is not fatal; the machine itself
        // is still returned.
        if (f.open(QIODevice::WriteOnly | QIODevice::Truncate)) {
            QDataStream stream(&f);
            stream << result;
        } else {
            qWarning("Could not write cache file %s", qPrintable(f.fileName()));
        }
    }
    return result;
}
#if !defined(AUTOTEST)
int main(int argc, char **argv)
QCoreApplication app(argc, argv);
QString ruleFile;
Config cfg;
const QStringList arguments = app.arguments().mid(1);
cfg.debug = arguments.contains("-debug");
const bool testRules = arguments.contains("-test");
cfg.cache = arguments.contains("-cache");
foreach (const QString &arg, arguments)
if (!arg.startsWith(QLatin1Char('-'))) {
ruleFile = arg;
if (ruleFile.isEmpty()) {
qWarning("usage: lexgen [-debug] [-cache] [-test] rulefile");
qWarning(" ");
qWarning(" the -test option will cause lexgen to interpret standard input");
qWarning(" according to the specified rules and print out pairs of token and");
qWarning(" lexical element");
return 1;
if (!loadConfig(ruleFile, &cfg))
return 1;
DFA machine = generateMachine(cfg);
if (machine.isEmpty())
return 1;
if (testRules) {
QString input = QTextStream(stdin).readAll();
qDebug() << "NFA has" << machine.stateCount() << "states";
qDebug() << "Converting to DFA... (this may take a while)";
DFA dfa = machine.toDFA();
qDebug() << "DFA has" << dfa.count() << "states";
qDebug() << "Minimizing...";
dfa = dfa.minimize();
qDebug() << "Minimized DFA has" << dfa.count() << "states";
DFA dfa = machine;
if (cfg.debug)
qDebug() << "tokenizing" << input;
bool ok = false;
QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
if (symbols.isEmpty()) {
qWarning("No tokens produced!");
} else {
foreach (Symbol s, symbols)
qDebug() << s.token << ":" << s.lexem;
if (ok)
qDebug() << symbols.count() << "tokens produced.";
qDebug() << "Error while tokenizing!";
} else {
Generator gen(machine, cfg);
<< gen.generate();
return 0;