qt 6.5.1 original

This commit is contained in:
kleuter
2023-10-29 23:33:08 +01:00
parent 71d22ab6b0
commit 85d238dfda
21202 changed files with 5499099 additions and 0 deletions

15
util/lexgen/README Normal file
View File

@ -0,0 +1,15 @@
Lexgen
------
This is a little tool to generate lexical scanners from a rather simplistic
configuration file. We use it internally in Qt to generate the scanner for the
CSS parser that is built into the toolkit (used for the widget styling and the
HTML import into QTextDocument).
Beware, it's very slow (in generating the code) and it may not generate what
you want. But I like that it generates code that operates on QChar and friends.
Use at your own risk ;-)
--
Simon Hausmann <simon.hausmann@qt.io>

View File

@ -0,0 +1,61 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include "configfile.h"
#include <QFile>
ConfigFile::SectionMap ConfigFile::parse(const QString &fileName)
{
QFile f(fileName);
if (!f.open(QIODevice::ReadOnly))
return ConfigFile::SectionMap();
return parse(&f);
}
ConfigFile::SectionMap ConfigFile::parse(QIODevice *dev)
{
SectionMap sections;
SectionMap::Iterator currentSection = sections.end();
ConfigFile::SectionMap result;
int currentLineNumber = 0;
while (!dev->atEnd()) {
QString line = QString::fromUtf8(dev->readLine()).trimmed();
++currentLineNumber;
if (line.isEmpty() || line.startsWith(QLatin1Char('#')))
continue;
if (line.startsWith(QLatin1Char('['))) {
if (!line.endsWith(']')) {
qWarning("Syntax error at line %d: Missing ']' at start of new section.", currentLineNumber);
return SectionMap();
}
line.remove(0, 1);
line.chop(1);
const QString sectionName = line;
currentSection = sections.insert(sectionName, Section());
continue;
}
if (currentSection == sections.end()) {
qWarning("Syntax error at line %d: Entry found outside of any section.", currentLineNumber);
return SectionMap();
}
Entry e;
e.lineNumber = currentLineNumber;
int equalPos = line.indexOf(QLatin1Char('='));
if (equalPos == -1) {
e.key = line;
} else {
e.key = line;
e.key.truncate(equalPos);
e.key = e.key.trimmed();
e.value = line.mid(equalPos + 1).trimmed();
}
currentSection->append(e);
}
return sections;
}

44
util/lexgen/configfile.h Normal file
View File

@ -0,0 +1,44 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#ifndef CONFIGFILE_H
#define CONFIGFILE_H
#include <QIODevice>
#include <QList>
#include <QMap>
#include <QStringList>
struct ConfigFile
{
struct Entry
{
inline Entry() : lineNumber(-1) {}
int lineNumber;
QString key;
QString value;
};
struct Section : public QList<Entry>
{
inline bool contains(const QString &key) const
{
for (int i = 0; i < count(); ++i)
if (at(i).key == key)
return true;
return false;
}
inline QString value(const QString &key, const QString &defaultValue = QString()) const
{
for (int i = 0; i < count(); ++i)
if (at(i).key == key)
return at(i).value;
return defaultValue;
}
};
typedef QMap<QString, Section> SectionMap;
static SectionMap parse(const QString &fileName);
static SectionMap parse(QIODevice *dev);
};
#endif // CONFIGFILE_H

View File

@ -0,0 +1,97 @@
[Options]
case-sensitive
classname = QCssScanner_Generated
[Code Generator Options]
MapToCode[a-z] = (ch.unicode() >= 'a' && ch.unicode() <= 'z') || (ch.unicode() >= 'A' && ch.unicode() <= 'Z') || ch.unicode() >= 256
TokenPrefix = QCss::
FileHeader = ../../header.LGPL
[Macros]
escape = \\[^\r\n\f0-9a-f]
nmstart = [_a-z]|{escape}
nmchar = [_a-z0-9-]|{escape}
nl = \n|\r\n|\r|\f
string1 = \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
string2 = \'([^\n\r\f\\']|\\{nl}|{escape})*\'
invalid1 = \"([^\n\r\f\\"]|\\{nl}|{escape})*
invalid2 = \'([^\n\r\f\\']|\\{nl}|{escape})*
ident = -?{nmstart}{nmchar}*
name = {nmchar}+
num = [0-9]+|[0-9]*"."[0-9]+
string = {string1}|{string2}
invalid = {invalid1}|{invalid2}
url = ([!#$%&*-~]|{escape})*
s = [ \t\r\n\f]
w = {s}*
[Tokens]
S = {s}+
handleCommentStart() = \/\*
CDO = "<!--"
CDC = "-->"
INCLUDES = "~="
DASHMATCH = "|="
BEGINSWITH = "^="
ENDSWITH = "$="
CONTAINS = "*="
LBRACE = {w}"{"
PLUS = {w}"+"
GREATER = {w}">"
COMMA = {w}","
TILDE= {w}"~"
STRING = {string}
INVALID = {invalid}
IDENT = {ident}
HASH = "#"{name}
ATKEYWORD_SYM = "@"{ident}
EXCLAMATION_SYM = "!"
#EMS = {num}em
#EXS = {num}ex
#LENGTH = {num}px
#LENGTH = {num}cm
#LENGTH = {num}mm
#LENGTH = {num}in
#LENGTH = {num}pt
#LENGTH = {num}pc
#ANGLE = {num}deg
#ANGLE = {num}rad
#ANGLE = {num}grad
#TIME = {num}ms
#TIME = {num}s
#FREQ = {num}hz
#FREQ = {num}khz
#DIMENSION = {num}{ident}
LENGTH = {num}{ident}
PERCENTAGE = {num}%
NUMBER = {num}
#URI = "url("{w}{string}{w}")"
#URI = "url("{w}{url}{w}")"
FUNCTION = {ident}"("
COLON = :
SEMICOLON = ;
RBRACE = \}
SLASH = /
MINUS = -
DOT = \.
STAR = \*
LBRACKET = \[
RBRACKET = \]
EQUAL = \=
LPAREN = \(
RPAREN = \)
OR = \|

497
util/lexgen/generator.cpp Normal file
View File

@ -0,0 +1,497 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include "generator.h"
#include <QFile>
#include <QDir>
void Function::printDeclaration(CodeBlock &block, const QString &funcNamePrefix) const
{
block << (iline ? "inline " : "") << signature(funcNamePrefix) << (iline ? QLatin1String(" {") : QLatin1String(";"));
if (!iline)
return;
block.indent();
QString tmp = body;
if (tmp.endsWith(QLatin1Char('\n')))
tmp.chop(1);
foreach (QString line, tmp.split(QLatin1Char('\n')))
block << line;
block.outdent();
block << "}";
}
QString Function::signature(const QString &funcNamePrefix) const
{
QString sig;
if (!rtype.isEmpty()) {
sig += rtype;
sig += QLatin1Char(' ');
}
sig += funcNamePrefix;
sig += fname;
if (cnst)
sig += " const";
return sig;
}
QString Function::definition() const
{
if (iline)
return QString();
QString result;
result += signature();
result += QLatin1String("\n{\n");
QString tmp = body;
if (tmp.endsWith(QLatin1Char('\n')))
tmp.chop(1);
if (!tmp.startsWith(QLatin1Char('\n')))
tmp.prepend(" ");
tmp.replace(QLatin1Char('\n'), QLatin1String("\n "));
result += tmp;
result += QLatin1String("\n}\n");
return result;
}
void Class::Section::printDeclaration(const Class *klass, CodeBlock &block) const
{
foreach (Function ctor, constructors)
ctor.printDeclaration(block, klass->name());
if (!constructors.isEmpty())
block.addNewLine();
foreach (Function func, functions)
func.printDeclaration(block);
if (!functions.isEmpty())
block.addNewLine();
foreach (QString var, variables)
block << var << ';';
}
void Class::addConstructor(Access access, const QString &body, const QString &_args)
{
Function ctor;
QString args = _args;
if (!args.startsWith(QLatin1Char('('))
&& !args.endsWith(QLatin1Char(')'))) {
args.prepend('(');
args.append(')');
}
ctor.setName(args);
ctor.addBody(body);
sections[access].constructors.append(ctor);
}
QString Class::Section::definition(const Class *klass) const
{
QString result;
foreach (Function ctor, constructors) {
ctor.setName(klass->name() + "::" + klass->name() + ctor.name());
result += ctor.definition();
result += QLatin1Char('\n');
}
foreach (Function func, functions) {
if (!func.hasBody()) continue;
func.setName(klass->name() + "::" + func.name());
result += func.definition();
result += QLatin1Char('\n');
}
return result;
}
QString Class::declaration() const
{
CodeBlock block;
block << QLatin1String("class ") << cname;
block << "{";
if (!sections[PublicMember].isEmpty()) {
block << "public:";
block.indent();
sections[PublicMember].printDeclaration(this, block);
block.outdent();
}
if (!sections[ProtectedMember].isEmpty()) {
block << "protected:";
block.indent();
sections[ProtectedMember].printDeclaration(this, block);
block.outdent();
}
if (!sections[PrivateMember].isEmpty()) {
block << "private:";
block.indent();
sections[PrivateMember].printDeclaration(this, block);
block.outdent();
}
block << "};";
block.addNewLine();
return block.toString();
}
QString Class::definition() const
{
return sections[PrivateMember].definition(this)
+ sections[ProtectedMember].definition(this)
+ sections[PublicMember].definition(this);
}
Generator::Generator(const DFA &_dfa, const Config &config)
: dfa(_dfa), cfg(config)
{
QList<InputType> lst = cfg.maxInputSet.toList();
std::sort(lst.begin(), lst.end());
minInput = lst.first();
maxInput = lst.last();
ConfigFile::Section section = config.configSections.value("Code Generator Options");
foreach (ConfigFile::Entry entry, section) {
if (!entry.key.startsWith(QLatin1String("MapToCode["))
|| !entry.key.endsWith(QLatin1Char(']')))
continue;
QString range = entry.key;
range.remove(0, qstrlen("MapToCode["));
range.chop(1);
if (range.length() != 3
|| range.at(1) != QLatin1Char('-')) {
qWarning("Invalid range for char mapping function: %s", qPrintable(range));
continue;
}
TransitionSequence seq;
seq.first = range.at(0).unicode();
seq.last = range.at(2).unicode();
seq.testFunction = entry.value;
charFunctionRanges.append(seq);
}
QString tokenPrefix = section.value("TokenPrefix");
if (!tokenPrefix.isEmpty()) {
for (int i = 0; i < dfa.count(); ++i)
if (!dfa.at(i).symbol.isEmpty()
&& !dfa.at(i).symbol.endsWith(QLatin1String("()")))
dfa[i].symbol.prepend(tokenPrefix);
}
headerFileName = section.value("FileHeader");
}
static inline bool adjacentKeys(int left, int right) { return left + 1 == right; }
//static inline bool adjacentKeys(const InputType &left, const InputType &right)
//{ return left.val + 1 == right.val; }
static QList<Generator::TransitionSequence> convertToSequences(const TransitionMap &transitions)
{
QList<Generator::TransitionSequence> sequences;
if (transitions.isEmpty())
return sequences;
QList<InputType> keys = transitions.keys();
std::sort(keys.begin(), keys.end());
int i = 0;
Generator::TransitionSequence sequence;
sequence.first = keys.at(0);
++i;
for (; i < keys.count(); ++i) {
if (adjacentKeys(keys.at(i - 1), keys.at(i))
&& transitions.value(keys.at(i)) == transitions.value(keys.at(i - 1))) {
continue;
}
sequence.last = keys.at(i - 1);
sequence.transition = transitions.value(sequence.last);
sequences.append(sequence);
sequence.first = keys.at(i);
}
sequence.last = keys.at(i - 1);
sequence.transition = transitions.value(sequence.last);
sequences.append(sequence);
return sequences;
}
QDebug &operator<<(QDebug &debug, const Generator::TransitionSequence &seq)
{
return debug << "[first:" << seq.first << "; last:" << seq.last << "; transition:" << seq.transition
<< (seq.testFunction.isEmpty() ? QString() : QString(QString("; testfunction:" + seq.testFunction)))
<< "]";
}
bool Generator::isSingleReferencedFinalState(int i) const
{
return backReferenceMap.value(i) == 1
&& dfa.at(i).transitions.isEmpty()
&& !dfa.at(i).symbol.isEmpty();
}
void Generator::generateTransitions(CodeBlock &body, const TransitionMap &transitions)
{
if (transitions.isEmpty())
return;
QList<TransitionSequence> sequences = convertToSequences(transitions);
bool needsCharFunction = false;
if (!charFunctionRanges.isEmpty()) {
int i = 0;
while (i < sequences.count()) {
const TransitionSequence &seq = sequences.at(i);
if (!seq.testFunction.isEmpty()) {
++i;
continue;
}
foreach (TransitionSequence range, charFunctionRanges)
if (range.first >= seq.first && range.last <= seq.last) {
needsCharFunction = true;
TransitionSequence left, middle, right;
left.first = seq.first;
left.last = range.first - 1;
left.transition = seq.transition;
middle = range;
middle.transition = seq.transition;
right.first = range.last + 1;
right.last = seq.last;
right.transition = seq.transition;
sequences.remove(i);
if (left.last >= left.first) {
sequences.insert(i, left);
++i;
}
sequences.insert(i, middle);
++i;
if (right.last >= right.first) {
sequences.insert(i, right);
++i;
}
i = -1;
break;
}
++i;
}
}
//qDebug() << "sequence count" << sequences.count();
//qDebug() << sequences;
if (sequences.count() < 10
|| sequences.last().last == maxInput
|| needsCharFunction) {
foreach (TransitionSequence seq, sequences) {
const bool embedFinalState = isSingleReferencedFinalState(seq.transition);
QString brace;
if (embedFinalState)
brace = " {";
if (!seq.testFunction.isEmpty()) {
body << "if (" << seq.testFunction << ")" << brace;
} else if (seq.first == seq.last) {
body << "if (ch.unicode() == " << seq.first << ")" << brace;
} else {
if (seq.last < maxInput)
body << "if (ch.unicode() >= " << seq.first
<< " && ch.unicode() <= " << seq.last << ")" << brace;
else
body << "if (ch.unicode() >= " << seq.first << ")" << brace;
}
body.indent();
if (embedFinalState) {
body << "token = " << dfa.at(seq.transition).symbol << ";";
body << "goto found;";
body.outdent();
body << "}";
} else {
body << "goto state_" << seq.transition << ";";
body.outdent();
}
}
} else {
QList<InputType> keys = transitions.keys();
std::sort(keys.begin(), keys.end());
body << "switch (ch.unicode()) {";
body.indent();
for (int k = 0; k < keys.count(); ++k) {
const InputType key = keys.at(k);
const int trans = transitions.value(key);
QString keyStr;
if (key == '\\')
keyStr = QString("\'\\\\\'");
else if (key >= 48 && key < 127)
keyStr = QString('\'') + QChar::fromLatin1(char(key)) + QChar('\'');
else
keyStr = QString::number(key);
if (k < keys.count() - 1
&& transitions.value(keys.at(k + 1)) == trans) {
body << "case " << keyStr << ":";
} else {
if (isSingleReferencedFinalState(trans)) {
body << "case " << keyStr << ": token = " << dfa.at(trans).symbol << "; goto found;";
} else {
body << "case " << keyStr << ": goto state_" << trans << ";";
}
}
}
body.outdent();
body << "}";
}
}
QString Generator::generate()
{
Class klass(cfg.className);
klass.addMember(Class::PublicMember, "QString input");
klass.addMember(Class::PublicMember, "int pos");
klass.addMember(Class::PublicMember, "int lexemStart");
klass.addMember(Class::PublicMember, "int lexemLength");
{
CodeBlock body;
body << "input = inp;";
body << "pos = 0;";
body << "lexemStart = 0;";
body << "lexemLength = 0;";
klass.addConstructor(Class::PublicMember, body, "const QString &inp");
}
{
Function next("QChar", "next()");
next.setInline(true);
if (cfg.caseSensitivity == Qt::CaseSensitive)
next.addBody("return (pos < input.length()) ? input.at(pos++) : QChar();");
else
next.addBody("return (pos < input.length()) ? input.at(pos++).toLower() : QChar();");
klass.addMember(Class::PublicMember, next);
}
/*
{
Function lexem("QString", "lexem()");
lexem.setConst(true);
lexem.setInline(true);
lexem.addBody("return input.mid(lexemStart, lexemLength);");
klass.addMember(Class::PublicMember, lexem);
}
*/
for (int i = 0; i < dfa.count(); ++i)
if (dfa.at(i).symbol.endsWith(QLatin1String("()"))) {
Function handlerFunc("int", dfa.at(i).symbol);
klass.addMember(Class::PublicMember, handlerFunc);
}
Function lexFunc;
lexFunc.setReturnType("int");
lexFunc.setName("lex()");
CodeBlock body;
body << "lexemStart = pos;";
body << "lexemLength = 0;";
body << "int lastAcceptingPos = -1;";
body << "int token = -1;";
body << "QChar ch;";
body.addNewLine();
backReferenceMap.clear();
foreach (State s, dfa)
foreach (int state, s.transitions)
backReferenceMap[state]++;
bool haveSingleReferencedFinalState = false;
for (int i = 0; i < dfa.count(); ++i) {
if (isSingleReferencedFinalState(i)) {
haveSingleReferencedFinalState = true;
continue;
}
if (i > 0)
body << "state_" << i << ":";
else
body << "// initial state";
body.indent();
if (!dfa.at(i).symbol.isEmpty()) {
body << "lastAcceptingPos = pos;";
body << "token = " << dfa.at(i).symbol << ";";
}
body.outdent();
body.indent();
if (!dfa.at(i).transitions.isEmpty()) {
body << "ch = next();";
generateTransitions(body, dfa.at(i).transitions);
}
body << "goto out;";
body.outdent();
}
if (haveSingleReferencedFinalState) {
body << "found:";
body << "lastAcceptingPos = pos;";
body.addNewLine();
}
body << "out:";
body << "if (lastAcceptingPos != -1) {";
body.indent();
body << "lexemLength = lastAcceptingPos - lexemStart;";
body << "pos = lastAcceptingPos;";
body.outdent();
body << "}";
body << "return token;";
lexFunc.addBody(body);
klass.addMember(Class::PublicMember, lexFunc);
QString header;
if (!headerFileName.isEmpty()) {
QString self(QDir::fromNativeSeparators(QStringLiteral(__FILE__)));
int lastSep = self.lastIndexOf(QChar('/'));
QDir here(lastSep < 0 ? QStringLiteral(".") : self.left(lastSep));
QFile headerFile(QDir::cleanPath(here.filePath(headerFileName)));
if (headerFile.exists() && headerFile.open(QIODevice::ReadOnly))
header = QString::fromUtf8(headerFile.readAll());
}
header += QLatin1String("// auto generated by qtbase/util/lexgen/. DO NOT EDIT.\n");
return header + klass.declaration() + klass.definition();
}

183
util/lexgen/generator.h Normal file
View File

@ -0,0 +1,183 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#ifndef GENERATOR_H
#define GENERATOR_H
#include <QTextStream>
#include <QStringList>
#include "nfa.h"
class LineStream
{
private:
struct SharedStream
{
int ref;
QTextStream *stream;
};
public:
LineStream(QTextStream *textStream)
{
shared = new SharedStream;
shared->ref = 1;
shared->stream = textStream;
}
LineStream(const LineStream &other)
{
shared = other.shared;
shared->ref++;
}
LineStream &operator=(const LineStream &other)
{
if (this == &other)
return *this;
LineStream copy(other); // keep refcount up
qSwap(*shared, *other.shared);
return *this;
}
~LineStream()
{
if (!--shared->ref) {
(*shared->stream) << endl;
delete shared;
}
}
template <typename T>
LineStream &operator<<(const T &value)
{ (*shared->stream) << value; return *this; }
SharedStream *shared;
};
class CodeBlock
{
public:
inline CodeBlock() { stream.setString(&output, QIODevice::WriteOnly); }
inline void indent() { indentStr += QLatin1String(" "); }
inline void outdent() { indentStr.remove(0, 4); }
template <typename T>
LineStream operator<<(const T &value)
{ stream << indentStr; stream << value; return LineStream(&stream); }
inline void addNewLine() { stream << endl; }
inline QString toString() const { stream.flush(); return output; }
private:
QString output;
mutable QTextStream stream;
QString indentStr;
};
class Function
{
public:
inline Function(const QString &returnType, const QString &name)
: rtype(returnType), fname(name), iline(false), cnst(false) {}
inline Function() : iline(false), cnst(false) {}
inline void setName(const QString &name) { fname = name; }
inline QString name() const { return fname; }
inline void setInline(bool i) { iline = i; }
inline bool isInline() const { return iline; }
inline void setReturnType(const QString &type) { rtype = type; }
inline QString returnType() const { return rtype; }
inline void addBody(const QString &_body) { body += _body; }
inline void addBody(const CodeBlock &block) { body += block.toString(); }
inline bool hasBody() const { return !body.isEmpty(); }
inline void setConst(bool konst) { cnst = konst; }
inline bool isConst() const { return cnst; }
void printDeclaration(CodeBlock &block, const QString &funcNamePrefix = QString()) const;
QString definition() const;
private:
QString signature(const QString &funcNamePrefix = QString()) const;
QString rtype;
QString fname;
QString body;
bool iline;
bool cnst;
};
class Class
{
public:
enum Access { PublicMember, ProtectedMember, PrivateMember };
inline Class(const QString &name) : cname(name) {}
inline void setName(const QString &name) { cname = name; }
inline QString name() const { return cname; }
inline void addMember(Access access, const QString &name)
{ sections[access].variables.append(name); }
inline void addMember(Access access, const Function &func)
{ sections[access].functions.append(func); }
void addConstructor(Access access, const QString &body, const QString &args = QString());
inline void addConstructor(Access access, const CodeBlock &body, const QString &args = QString())
{ addConstructor(access, body.toString(), args); }
QString declaration() const;
QString definition() const;
private:
QString cname;
struct Section
{
QList<Function> functions;
QStringList variables;
QList<Function> constructors;
inline bool isEmpty() const
{ return functions.isEmpty() && variables.isEmpty() && constructors.isEmpty(); }
void printDeclaration(const Class *klass, CodeBlock &block) const;
QString definition(const Class *klass) const;
};
Section sections[3];
};
class Generator
{
public:
Generator(const DFA &dfa, const Config &config);
QString generate();
private:
void generateTransitions(CodeBlock &body, const TransitionMap &transitions);
bool isSingleReferencedFinalState(int i) const;
DFA dfa;
Config cfg;
InputType minInput;
InputType maxInput;
QHash<int, int> backReferenceMap;
QString headerFileName;
public:
struct TransitionSequence
{
inline TransitionSequence() : first(-1), last(-1), transition(-1) {}
InputType first;
InputType last;
int transition;
QString testFunction;
};
private:
QList<TransitionSequence> charFunctionRanges;
};
#endif

75
util/lexgen/global.h Normal file
View File

@ -0,0 +1,75 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#ifndef GLOBAL_H
#define GLOBAL_H
#include <QHash>
#include <QDataStream>
#include <QSet>
#include "configfile.h"
#if 1
typedef int InputType;
enum SpecialInputType {
DigitInput,
SpaceInput,
Letter
};
#else
enum SpecialInputType {
NoSpecialInput = 0,
DigitInput,
SpaceInput,
LetterOrNumberInput
};
struct InputType
{
inline InputType() : val(0), specialInput(NoSpecialInput) {}
inline InputType(const int &val) : val(val), specialInput(NoSpecialInput) {}
inline operator int() const { return val; }
inline bool operator==(const InputType &other) const
{ return val == other.val; }
inline bool operator!=(const InputType &other) const
{ return val != other.val; }
int val;
SpecialInputType specialInput;
};
inline int qHash(const InputType &t) { return qHash(t.val); }
inline QDataStream &operator<<(QDataStream &stream, const InputType &i)
{
return stream << i;
}
inline QDataStream &operator>>(QDataStream &stream, InputType &i)
{
return stream >> i;
}
#endif
const InputType Epsilon = -1;
struct Config
{
inline Config() : caseSensitivity(Qt::CaseSensitive), debug(false), cache(false) {}
QSet<InputType> maxInputSet;
Qt::CaseSensitivity caseSensitivity;
QString className;
bool debug;
bool cache;
QString ruleFile;
ConfigFile::SectionMap configSections;
};
#endif // GLOBAL_H

24
util/lexgen/lexgen.lexgen Normal file
View File

@ -0,0 +1,24 @@
[Options]
case-sensitive
classname = RegExpTokenizer
[Code Generator Options]
TokenPrefix = RE2NFA::
[Macros]
Escape = \\.{1}
[Tokens]
TOK_QUOTED_STRING = \"[^\"]*\"
TOK_STRING = [^\{\}\(\)\,\*\|\?\.\+\[]|{Escape}
TOK_SEQUENCE = \[([^\]]|(\\\]))*\]
TOK_LBRACE = \{
TOK_RBRACE = \}
TOK_LPAREN = \(
TOK_RPAREN = \)
TOK_COMMA = \,
TOK_STAR = \*
TOK_OR = \|
TOK_QUESTION = \?
TOK_DOT = \.
TOK_PLUS = \+

3
util/lexgen/lexgen.pri Normal file
View File

@ -0,0 +1,3 @@
VPATH += $$PWD
SOURCES += nfa.cpp configfile.cpp re2nfa.cpp
INCLUDEPATH += $$PWD

6
util/lexgen/lexgen.pro Normal file
View File

@ -0,0 +1,6 @@
TEMPLATE = app
TARGET = lexgen
include(lexgen.pri)
SOURCES += main.cpp \
generator.cpp
QT = core

285
util/lexgen/main.cpp Normal file
View File

@ -0,0 +1,285 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include "nfa.h"
#include "re2nfa.h"
#include "configfile.h"
#include "generator.h"
#include <QFile>
#include <QCoreApplication>
#include <QFileInfo>
#include <QDateTime>
struct Symbol
{
QString token;
QString lexem;
};
static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
{
QList<Symbol> symbols;
Symbol lastSymbol;
int state = 0;
int lastAcceptingState = -1;
QString lastAcceptingLexem;
int lastAcceptingPos = -1;
for (int i = 0; i < input.length(); ++i) {
QChar ch = input.at(i);
QChar chForInput = ch;
if (cfg->caseSensitivity == Qt::CaseInsensitive)
chForInput = chForInput.toLower();
int next = dfa.at(state).transitions.value(chForInput.unicode());
if (cfg->debug)
qDebug() << "input" << input.at(i) << "leads to state" << next;
if (next) {
lastSymbol.lexem.append(input.at(i));
lastSymbol.token = dfa.at(next).symbol;
if (!lastSymbol.token.isEmpty()) {
lastAcceptingState = next;
lastAcceptingLexem = lastSymbol.lexem;
lastAcceptingPos = i;
}
state = next;
} else {
if (lastAcceptingState != -1) {
if (cfg->debug)
qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
Symbol s;
s.token = dfa.at(lastAcceptingState).symbol;
s.lexem = lastAcceptingLexem;
symbols << s;
lastSymbol = Symbol();
state = 0;
i = lastAcceptingPos;
lastAcceptingPos = -1;
lastAcceptingState = -1;
continue;
}
if (state == 0 || lastSymbol.token.isEmpty()) {
if (cfg->debug)
qDebug() << "invalid input";
if (ok)
*ok = false;
return symbols;
}
if (cfg->debug)
qDebug() << "appending symbol with token" << lastSymbol.token;
symbols << lastSymbol;
lastSymbol = Symbol();
state = 0;
lastAcceptingState = -1;
--i;
}
}
if (!lastSymbol.token.isEmpty()) {
if (cfg->debug)
qDebug() << "appending (last) symbol with token" << lastSymbol.token;
symbols << lastSymbol;
} else if (lastAcceptingState != -1) {
if (cfg->debug)
qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
Symbol s;
s.lexem = lastAcceptingLexem;
s.token = dfa.at(lastAcceptingState).symbol;
symbols << s;
}
if (ok)
*ok = true;
return symbols;
}
static QSet<InputType> determineMaxInputSet(const ConfigFile::Section &section)
{
QSet<InputType> set;
QString inputTypeName;
foreach (const ConfigFile::Entry &entry, section)
if (entry.key == QLatin1String("InputType")) {
if (!inputTypeName.isEmpty()) {
qWarning("Error: InputType field specified multiple times in config file");
return QSet<InputType>();
}
inputTypeName = entry.value;
}
if (inputTypeName.isEmpty())
inputTypeName = "quint8";
if (inputTypeName == "quint8") {
for (int i = 1; i < 256; ++i)
set.insert(i);
} /* else if ### */
else {
qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
return QSet<InputType>();
}
return set;
}
static bool loadConfig(const QString &ruleFile, Config *cfg)
{
ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
if (sections.isEmpty()) {
qWarning("Error parsing %s", qPrintable(ruleFile));
return false;
}
QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
if (maxInputSet.isEmpty())
return false;
Qt::CaseSensitivity cs = Qt::CaseInsensitive;
if (sections.value("Options").contains("case-sensitive"))
cs = Qt::CaseSensitive;
cfg->configSections = sections;
cfg->caseSensitivity = cs;
cfg->className = sections.value("Options").value("classname", "Scanner");
cfg->maxInputSet = maxInputSet;
cfg->ruleFile = ruleFile;
return true;
}
static DFA generateMachine(const Config &cfg)
{
if (cfg.cache) {
QFileInfo ruleInfo(cfg.ruleFile);
QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
if (cacheInfo.exists()
&& cacheInfo.lastModified() > ruleInfo.lastModified()) {
QFile f(cacheInfo.absoluteFilePath());
f.open(QIODevice::ReadOnly);
QDataStream stream(&f);
DFA machine;
stream >> machine;
return machine;
}
}
QMap<QString, NFA> macros;
foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) {
int errCol = 0;
if (cfg.debug)
qDebug() << "parsing" << e.value;
NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
if (nfa.isEmpty()) {
qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
return DFA();
}
macros.insert(e.key, nfa);
}
if (!cfg.configSections.contains("Tokens")) {
qWarning("Rule file does not contain a [Tokens] section!");
return DFA();
}
QList<NFA> tokens;
foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) {
int errCol = 0;
if (cfg.debug)
qDebug() << "parsing" << e.value;
NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
if (tok.isEmpty()) {
qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
return DFA();
}
tok.setTerminationSymbol(e.key);
tokens.append(tok);
}
NFA giganticStateMachine;
foreach (NFA nfa, tokens)
if (giganticStateMachine.isEmpty())
giganticStateMachine = nfa;
else
giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
DFA result = giganticStateMachine.toDFA().minimize();
if (cfg.cache) {
QFileInfo ruleInfo(cfg.ruleFile);
QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
QFile f(cacheInfo.absoluteFilePath());
f.open(QIODevice::WriteOnly | QIODevice::Truncate);
QDataStream stream(&f);
stream << result;
}
return result;
}
#if !defined(AUTOTEST)
int main(int argc, char **argv)
{
QCoreApplication app(argc, argv);
QString ruleFile;
Config cfg;
const QStringList arguments = app.arguments().mid(1);
cfg.debug = arguments.contains("-debug");
const bool testRules = arguments.contains("-test");
cfg.cache = arguments.contains("-cache");
foreach (const QString &arg, arguments)
if (!arg.startsWith(QLatin1Char('-'))) {
ruleFile = arg;
break;
}
if (ruleFile.isEmpty()) {
qWarning("usage: lexgen [-debug] [-cache] [-test] rulefile");
qWarning(" ");
qWarning(" the -test option will cause lexgen to interpret standard input");
qWarning(" according to the specified rules and print out pairs of token and");
qWarning(" lexical element");
return 1;
}
if (!loadConfig(ruleFile, &cfg))
return 1;
DFA machine = generateMachine(cfg);
if (machine.isEmpty())
return 1;
if (testRules) {
qWarning("Testing:");
QString input = QTextStream(stdin).readAll();
/*
qDebug() << "NFA has" << machine.stateCount() << "states";
qDebug() << "Converting to DFA... (this may take a while)";
DFA dfa = machine.toDFA();
qDebug() << "DFA has" << dfa.count() << "states";
qDebug() << "Minimizing...";
dfa = dfa.minimize();
qDebug() << "Minimized DFA has" << dfa.count() << "states";
*/
DFA dfa = machine;
if (cfg.debug)
qDebug() << "tokenizing" << input;
bool ok = false;
QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
if (symbols.isEmpty()) {
qWarning("No tokens produced!");
} else {
foreach (Symbol s, symbols)
qDebug() << s.token << ":" << s.lexem;
}
if (ok)
qDebug() << symbols.count() << "tokens produced.";
else
qDebug() << "Error while tokenizing!";
} else {
Generator gen(machine, cfg);
QTextStream(stdout)
<< gen.generate();
}
return 0;
}
#endif

466
util/lexgen/nfa.cpp Normal file
View File

@ -0,0 +1,466 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include "nfa.h"
#include <QSet>
#include <limits.h>
NFA NFA::createSingleInputNFA(InputType input)
{
NFA result;
result.initialize(2);
result.addTransition(result.initialState, input, result.finalState);
return result;
}
NFA NFA::createSymbolNFA(const QString &symbol)
{
NFA result = NFA::createSingleInputNFA(Epsilon);
result.states[result.finalState].symbol = symbol;
return result;
}
void NFA::initialize(int size)
{
states.resize(size);
states.fill(State());
initialState = 0;
finalState = size - 1;
}
void NFA::addTransition(int from, InputType input, int to)
{
assertValidState(from);
assertValidState(to);
states[from].transitions.insertMulti(input, to);
}
void NFA::copyFrom(const NFA &other, int baseState)
{
assertValidState(baseState);
assertValidState(baseState + other.states.count() - 1);
for (int i = 0; i < other.states.count(); ++i) {
State s = other.states.at(i);
for (TransitionMap::Iterator it = s.transitions.begin(),
end = s.transitions.end(); it != end; ++it)
*it += baseState;
states[baseState + i] = s;
}
}
void NFA::initializeFromPair(const NFA &a, const NFA &b,
int *initialA, int *finalA,
int *initialB, int *finalB)
{
initialize(a.states.count() + b.states.count() + 2);
int baseIdxA = 1;
int baseIdxB = 1 + a.states.count();
*initialA = a.initialState + baseIdxA;
*finalA = a.finalState + baseIdxA;
*initialB = b.initialState + baseIdxB;
*finalB = b.finalState + baseIdxB;
copyFrom(a, baseIdxA);
copyFrom(b, baseIdxB);
}
NFA NFA::createAlternatingNFA(const NFA &a, const NFA &b)
{
NFA result;
int newInitialA, newFinalA,
newInitialB, newFinalB;
result.initializeFromPair(a, b, &newInitialA, &newFinalA,
&newInitialB, &newFinalB);
result.addTransition(result.initialState, Epsilon, newInitialA);
result.addTransition(result.initialState, Epsilon, newInitialB);
result.addTransition(newFinalA, Epsilon, result.finalState);
result.addTransition(newFinalB, Epsilon, result.finalState);
return result;
}
NFA NFA::createConcatenatingNFA(const NFA &a, const NFA &b)
{
NFA result;
int initialA, finalA,
initialB, finalB;
result.initializeFromPair(a, b, &initialA, &finalA, &initialB, &finalB);
result.addTransition(result.initialState, Epsilon, initialA);
result.addTransition(finalA, Epsilon, initialB);
result.addTransition(finalB, Epsilon, result.finalState);
return result;
}
NFA NFA::createOptionalNFA(const NFA &a)
{
NFA result;
result.initialize(a.states.count() + 2);
int baseIdxA = 1;
int initialA = a.initialState + baseIdxA;
int finalA = a.finalState + baseIdxA;
result.copyFrom(a, baseIdxA);
result.addTransition(result.initialState, Epsilon, initialA);
result.addTransition(result.initialState, Epsilon, result.finalState);
result.addTransition(finalA, Epsilon, initialA);
result.addTransition(finalA, Epsilon, result.finalState);
return result;
}
NFA NFA::createStringNFA(const QByteArray &str)
{
NFA result;
foreach (char c, str) {
NFA ch = NFA::createSingleInputNFA(c);
if (result.isEmpty())
result = ch;
else
result = NFA::createConcatenatingNFA(result, ch);
}
return result;
}
NFA NFA::createSetNFA(const QSet<InputType> &set)
{
NFA result;
result.initialize(set.count() + 2);
int state = 1;
for (QSet<InputType>::ConstIterator it = set.constBegin(), end = set.constEnd();
it != end; ++it, ++state) {
result.addTransition(result.initialState, Epsilon, state);
result.addTransition(state, *it, result.finalState);
}
/*
foreach (InputType input, set) {
NFA ch = NFA::createSingleInputNFA(input);
if (result.isEmpty())
result = ch;
else
result = NFA::createAlternatingNFA(result, ch);
}
*/
return result;
}
NFA NFA::createZeroOrOneNFA(const NFA &a)
{
NFA epsilonNFA = createSingleInputNFA(Epsilon);
return NFA::createAlternatingNFA(a, epsilonNFA);
}
NFA NFA::applyQuantity(const NFA &a, int minOccurrences, int maxOccurrences)
{
NFA result = a;
NFA epsilonNFA = createSingleInputNFA(Epsilon);
if (minOccurrences == 0) {
result = NFA::createAlternatingNFA(result, epsilonNFA);
} else {
minOccurrences--;
}
maxOccurrences--;
for (int i = 0; i < minOccurrences; ++i)
result = NFA::createConcatenatingNFA(result, a);
for (int i = minOccurrences; i < maxOccurrences; ++i)
result = NFA::createConcatenatingNFA(result, NFA::createAlternatingNFA(a, epsilonNFA));
return result;
}
void NFA::debug()
{
qDebug() << "NFA has" << states.count() << "states";
qDebug() << "initial state is" << initialState;
qDebug() << "final state is" << finalState;
for (int i = 0; i < states.count(); ++i) {
const State &s = states.at(i);
for (TransitionMap::ConstIterator it = s.transitions.constBegin(),
end = s.transitions.constEnd(); it != end; ++it)
qDebug() << "transition from state" << i << "to" << it.value() << "through"
<< (it.key() == Epsilon ? QString("Epsilon") : QString(char(it.key())));
if (!s.symbol.isEmpty())
qDebug() << "State" << i << "leads to symbol" << s.symbol;
}
}
// helper
typedef QSet<int> DFAState;
// that's a bad hash, but it's good enough for us
// and it allows us to use the nice QHash API :)
inline size_t qHash(const DFAState &state)
{
size_t val = 0;
foreach (int s, state)
val ^= qHash(s);
return val;
}
DFA NFA::toDFA() const
{
DFA result;
result.reserve(states.count());
QHash<QString, int> symbolReferenceCounts;
{
QSet<int> symbolStates;
for (int i = 0; i < states.count(); ++i)
if (!states.at(i).symbol.isEmpty())
symbolStates.insert(i);
QHash<int, QString> epsilonStates;
for (int i = 0; i < states.count(); ++i) {
const State &s = states.at(i);
for (TransitionMap::ConstIterator transition = s.transitions.constBegin(), end = s.transitions.constEnd();
transition != end; ++transition)
if (transition.key() == Epsilon && symbolStates.contains(transition.value()))
epsilonStates.insert(i, states.at(transition.value()).symbol);
}
int lastCount;
do {
lastCount = epsilonStates.count();
for (int i = 0; i < states.count(); ++i) {
const State &s = states.at(i);
for (TransitionMap::ConstIterator transition = s.transitions.constBegin(), end = s.transitions.constEnd();
transition != end; ++transition)
if (transition.key() == Epsilon && epsilonStates.contains(transition.value()))
epsilonStates.insert(i, epsilonStates.value(transition.value()));
}
} while (lastCount != epsilonStates.count());
for (int i = 0; i < states.count(); ++i) {
const State &s = states.at(i);
for (TransitionMap::ConstIterator transition = s.transitions.constBegin(), end = s.transitions.constEnd();
transition != end; ++transition) {
if (transition.key() == Epsilon)
continue;
if (symbolStates.contains(transition.value())) {
const QString symbol = states.at(transition.value()).symbol;
symbolReferenceCounts[symbol]++;
} else if (epsilonStates.contains(transition.value())) {
const QString symbol = epsilonStates.value(transition.value());
symbolReferenceCounts[symbol]++;
}
}
}
/*
for (QHash<QString, int>::ConstIterator symIt = symbolReferenceCounts.constBegin(), symEnd = symbolReferenceCounts.constEnd();
symIt != symEnd; ++symIt)
qDebug() << "symbol" << symIt.key() << "is reached" << symIt.value() << "times";
*/
}
QSet<InputType> validInput;
foreach (const State &s, states)
for (TransitionMap::ConstIterator it = s.transitions.constBegin(),
end = s.transitions.constEnd(); it != end; ++it)
if (it.key() != Epsilon)
validInput.insert(it.key());
// A DFA state can consist of multiple NFA states.
// the dfaStateMap maps from these to the actual
// state index within the resulting DFA vector
QHash<DFAState, int> dfaStateMap;
QStack<DFAState> pendingDFAStates;
DFAState startState = epsilonClosure(QSet<int>() << initialState);
result.resize(1);
dfaStateMap.insert(startState, 0);
pendingDFAStates.push(startState);
while (!pendingDFAStates.isEmpty()) {
DFAState state = pendingDFAStates.pop();
// qDebug() << "processing" << state << "from the stack of pending states";
foreach (InputType input, validInput) {
QSet<int> reachableStates;
foreach (int nfaState, state) {
const TransitionMap &transitions = states.at(nfaState).transitions;
TransitionMap::ConstIterator it = transitions.find(input);
while (it != transitions.constEnd() && it.key() == input) {
reachableStates.insert(it.value());
++it;
}
}
if (reachableStates.isEmpty())
continue;
// qDebug() << "can reach" << reachableStates << "from input" << char(input);
QSet<int> closure = epsilonClosure(reachableStates);
// qDebug() << "closure is" << closure;
if (!dfaStateMap.contains(closure)) {
int dfaState = result.count();
result.append(State());
QString symbol;
int refCount = INT_MAX;
foreach (int nfaState, closure)
if (!states.at(nfaState).symbol.isEmpty()) {
// qDebug() << "closure also contains symbol" << states.at(nfaState).symbol;
QString candidate = states.at(nfaState).symbol;
int candidateRefCount =symbolReferenceCounts.value(candidate, INT_MAX);
if (candidateRefCount < refCount) {
refCount = candidateRefCount;
symbol = candidate;
}
}
if (!symbol.isEmpty())
result.last().symbol = symbol;
dfaStateMap.insert(closure, dfaState);
Q_ASSERT(!pendingDFAStates.contains(closure));
pendingDFAStates.prepend(closure);
}
result[dfaStateMap.value(state)].transitions.insert(input, dfaStateMap.value(closure));
}
}
return result;
}
QSet<int> NFA::epsilonClosure(const QSet<int> &initialClosure) const
{
QSet<int> closure = initialClosure;
closure.reserve(closure.count() * 4);
QStack<int> stateStack;
stateStack.resize(closure.count());
std::copy(closure.constBegin(), closure.constEnd(), stateStack.begin());
while (!stateStack.isEmpty()) {
int t = stateStack.pop();
const TransitionMap &transitions = states.at(t).transitions;
TransitionMap::ConstIterator it = transitions.find(Epsilon);
while (it != transitions.constEnd() && it.key() == Epsilon) {
const int u = it.value();
if (!closure.contains(u)) {
closure.insert(u);
stateStack.push(u);
}
++it;
}
}
return closure;
}
void NFA::setTerminationSymbol(const QString &symbol)
{
states[finalState].symbol = symbol;
}
void DFA::debug() const
{
qDebug() << "DFA has" << count() << "states";
for (int i = 0; i < count(); ++i) {
const State &s = at(i);
if (s.transitions.isEmpty()) {
qDebug() << "State" << i << "has no transitions";
} else {
for (TransitionMap::ConstIterator it = s.transitions.constBegin(),
end = s.transitions.constEnd(); it != end; ++it)
qDebug() << "transition from state" << i << "to" << it.value() << "through"
<< (it.key() == Epsilon ? QString("Epsilon") : QString(char(it.key())));
}
if (!s.symbol.isEmpty())
qDebug() << "State" << i << "leads to symbol" << s.symbol;
}
}
DFA DFA::minimize() const
{
QList<bool> inequivalentStates(count() * count());
inequivalentStates.fill(false);
for (int i = 0; i < count(); ++i)
for (int j = 0; j < i; ++j) {
if (i != j && at(i).symbol != at(j).symbol)
inequivalentStates[i * count() + j] = true;
}
bool done;
do {
done = true;
for (int i = 0; i < count(); ++i)
for (int j = 0; j < count(); ++j) {
if (i == j)
continue;
if (inequivalentStates[i * count() + j])
continue;
if (at(i).transitions.keys() != at(j).transitions.keys()) {
inequivalentStates[i * count() + j] = true;
done = false;
continue;
}
foreach (InputType a, at(i).transitions.keys()) {
int r = at(i).transitions.value(a, -1);
if (r == -1)
continue;
int s = at(j).transitions.value(a, -1);
if (s == -1)
continue;
if (inequivalentStates[r * count() + s]
|| r == s) {
inequivalentStates[i * count() + j] = true;
done = false;
break;
}
}
}
} while (!done);
QHash<int, int> statesToEliminate;
for (int i = 0; i < count(); ++i)
for (int j = 0; j < i; ++j)
if (!inequivalentStates[i * count() + j]) {
statesToEliminate.insertMulti(i, j);
}
/*
qDebug() << "states to eliminiate:" << statesToEliminate.count();;
qDebug() << "merging" << statesToEliminate;
debug();
*/
return *this;
}

89
util/lexgen/nfa.h Normal file
View File

@ -0,0 +1,89 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#ifndef NFA_H
#define NFA_H
#include <QByteArray>
#include <QDebug>
#include <QHash>
#include <QList>
#include <QMap>
#include <QStack>
#include <QString>
#include "global.h"
typedef QHash<InputType, int> TransitionMap;
struct State
{
QString symbol;
TransitionMap transitions;
};
inline QDataStream &operator<<(QDataStream &stream, const State &state)
{
return stream << state.symbol << state.transitions;
}
inline QDataStream &operator>>(QDataStream &stream, State &state)
{
return stream >> state.symbol >> state.transitions;
}
struct DFA : public QList<State>
{
void debug() const;
DFA minimize() const;
};
class NFA
{
public:
static NFA createSingleInputNFA(InputType input);
static NFA createSymbolNFA(const QString &symbol);
static NFA createAlternatingNFA(const NFA &a, const NFA &b);
static NFA createConcatenatingNFA(const NFA &a, const NFA &b);
static NFA createOptionalNFA(const NFA &a);
// convenience
static NFA createStringNFA(const QByteArray &str);
static NFA createSetNFA(const QSet<InputType> &set);
static NFA createZeroOrOneNFA(const NFA &a);
static NFA applyQuantity(const NFA &a, int minOccurrences, int maxOccurrences);
void setTerminationSymbol(const QString &symbol);
DFA toDFA() const;
inline bool isEmpty() const { return states.isEmpty(); }
inline int stateCount() const { return states.count(); }
void debug();
private:
void initialize(int size);
void addTransition(int from, InputType input, int to);
void copyFrom(const NFA &other, int baseState);
void initializeFromPair(const NFA &a, const NFA &b,
int *initialA, int *finalA,
int *initialB, int *finalB);
QSet<int> epsilonClosure(const QSet<int> &initialClosure) const;
inline void assertValidState(int state)
{ Q_UNUSED(state); Q_ASSERT(state >= 0); Q_ASSERT(state < states.count()); }
#if defined(AUTOTEST)
public:
#endif
int initialState;
int finalState;
QList<State> states;
};
#endif // NFA_H

509
util/lexgen/re2nfa.cpp Normal file
View File

@ -0,0 +1,509 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include "re2nfa.h"
#include "tokenizer.cpp"
RE2NFA::RE2NFA(const QMap<QString, NFA> &macros, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs)
: macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs)
{
}
NFA RE2NFA::parse(const QString &expression, int *errCol)
{
tokenize(expression);
if (symbols.isEmpty())
return NFA();
index = 0;
NFA result = parseExpr();
if (result.isEmpty()) {
if (errCol)
*errCol = errorColumn;
}
return result;
}
NFA RE2NFA::parseExpr()
{
NFA value = parseBranch();
while (test(TOK_OR)) {
NFA rhs = parseBranch();
value = NFA::createAlternatingNFA(value, rhs);
}
return value;
}
NFA RE2NFA::parseBranch()
{
NFA value = parsePiece();
if (!hasNext())
return value;
NFA next;
do {
next = parsePiece();
if (!next.isEmpty())
value = NFA::createConcatenatingNFA(value, next);
} while (!next.isEmpty() && hasNext());
return value;
}
NFA RE2NFA::parsePiece()
{
NFA atom = parseAtom();
if (atom.isEmpty() || !hasNext())
return atom;
return parseMaybeQuantifier(atom);
}
NFA RE2NFA::parseAtom()
{
// ####
switch (next()) {
case TOK_STRING:
return createCharNFA();
case TOK_LPAREN: {
NFA subExpr = parseExpr();
next(TOK_RPAREN);
return subExpr;
}
case TOK_LBRACE: {
QString macroName = lexemUntil(TOK_RBRACE);
QMap<QString, NFA>::ConstIterator macro = macros.find(macroName);
if (macro == macros.end()) {
qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName));
return NFA();
}
return *macro;
}
case TOK_LBRACKET: {
NFA set = parseSet();
next(TOK_RBRACKET);
return set;
}
case TOK_SEQUENCE:
return parseSet2();
case TOK_DOT:
return NFA::createSetNFA(maxInputSet);
default:
prev();
return NFA();
}
}
NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa)
{
// ####
switch (next()) {
case TOK_STAR:
return NFA::createOptionalNFA(nfa);
case TOK_QUESTION:
return NFA::createZeroOrOneNFA(nfa);
case TOK_PLUS:
return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa));
case TOK_LBRACE: {
const int rewind = index - 1;
QString lexemBeforeComma;
QString lexemAfterComma;
bool seenComma = false;
forever {
if (test(TOK_COMMA)) {
if (seenComma) {
errorColumn = symbol().column;
return NFA();
}
seenComma = true;
} else if (test(TOK_RBRACE)) {
break;
} else {
next(TOK_STRING);
if (seenComma)
lexemAfterComma += symbol().lexem;
else
lexemBeforeComma += symbol().lexem;
}
}
bool isNumber = false;
int min = lexemBeforeComma.toInt(&isNumber);
if (!isNumber) {
index = rewind;
return nfa;
}
int max = min;
if (seenComma) {
max = lexemAfterComma.toInt(&isNumber);
if (!isNumber) {
errorColumn = symbol().column;
return NFA();
}
}
return NFA::applyQuantity(nfa, min, max);
}
default:
prev();
return nfa;
}
}
NFA RE2NFA::parseSet()
{
QSet<InputType> set;
bool negate = false;
next(TOK_STRING);
do {
Q_ASSERT(symbol().lexem.length() == 1);
// ###
QChar ch = symbol().lexem.at(0);
if (set.isEmpty() && ch == QLatin1Char('^')) {
negate = true;
continue;
}
// look ahead for ranges like a-z
bool rangeFound = false;
if (test(TOK_STRING)) {
if (symbol().lexem.length() == 1
&& symbol().lexem.at(0) == QLatin1Char('-')) {
next(TOK_STRING);
Q_ASSERT(symbol().lexem.length() == 1);
QChar last = symbol().lexem.at(0);
if (ch.unicode() > last.unicode())
qSwap(ch, last);
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(QChar(i).toLower().unicode());
} else {
set.insert(i);
}
}
rangeFound = true;
} else {
prev();
}
}
if (!rangeFound) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(ch.toLower().unicode());
} else {
set.insert(ch.unicode());
}
}
} while (test(TOK_STRING));
if (negate) {
QSet<InputType> negatedSet = maxInputSet;
negatedSet.subtract(set);
set = negatedSet;
}
return NFA::createSetNFA(set);
}
NFA RE2NFA::parseSet2()
{
QSet<InputType> set;
bool negate = false;
QString str = symbol().lexem;
// strip off brackets
str.chop(1);
str.remove(0, 1);
int i = 0;
while (i < str.length()) {
// ###
QChar ch = str.at(i++);
if (set.isEmpty() && ch == QLatin1Char('^')) {
negate = true;
continue;
}
// look ahead for ranges like a-z
bool rangeFound = false;
if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) {
++i;
QChar last = str.at(i++);
if (ch.unicode() > last.unicode())
qSwap(ch, last);
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(QChar(i).toLower().unicode());
} else {
set.insert(i);
}
}
rangeFound = true;
}
if (!rangeFound) {
if (caseSensitivity == Qt::CaseInsensitive) {
set.insert(ch.toLower().unicode());
} else {
set.insert(ch.unicode());
}
}
}
if (negate) {
QSet<InputType> negatedSet = maxInputSet;
negatedSet.subtract(set);
set = negatedSet;
}
return NFA::createSetNFA(set);
}
NFA RE2NFA::createCharNFA()
{
NFA nfa;
// ####
if (caseSensitivity == Qt::CaseInsensitive) {
nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1());
} else {
nfa = NFA::createStringNFA(symbol().lexem.toLatin1());
}
return nfa;
}
static inline int skipQuote(const QString &str, int pos)
{
while (pos < str.length()
&& str.at(pos) != QLatin1Char('"')) {
if (str.at(pos) == QLatin1Char('\\')) {
++pos;
if (pos >= str.length())
break;
}
++pos;
}
if (pos < str.length())
++pos;
return pos;
}
#if 0
static const char*tokStr(Token t)
{
switch (t) {
case TOK_INVALID: return "TOK_INVALID";
case TOK_STRING: return "TOK_STRING";
case TOK_LBRACE: return "TOK_LBRACE";
case TOK_RBRACE: return "TOK_RBRACE";
case TOK_LBRACKET: return "TOK_LBRACKET";
case TOK_RBRACKET: return "TOK_RBRACKET";
case TOK_LPAREN: return "TOK_LPAREN";
case TOK_RPAREN: return "TOK_RPAREN";
case TOK_COMMA: return "TOK_COMMA";
case TOK_STAR: return "TOK_STAR";
case TOK_OR: return "TOK_OR";
case TOK_QUESTION: return "TOK_QUESTION";
case TOK_DOT: return "TOK_DOT";
case TOK_PLUS: return "TOK_PLUS";
case TOK_SEQUENCE: return "TOK_SEQUENCE";
case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING";
}
return "";
}
#endif
void RE2NFA::tokenize(const QString &input)
{
symbols.clear();
#if 1
RegExpTokenizer tokenizer(input);
Symbol sym;
int tok = tokenizer.lex();
while (tok != -1) {
Symbol sym;
sym.token = static_cast<Token>(tok);
sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);
if (sym.token == TOK_QUOTED_STRING) {
sym.lexem.chop(1);
sym.lexem.remove(0, 1);
sym.token = TOK_STRING;
}
if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
for (int i = 0; i < sym.lexem.length(); ++i) {
if (sym.lexem.at(i) == '\\') {
if (i >= sym.lexem.length() - 1)
break;
QChar ch = sym.lexem.at(i + 1);
if (ch == QLatin1Char('n')) {
ch = '\n';
} else if (ch == QLatin1Char('r')) {
ch = '\r';
} else if (ch == QLatin1Char('t')) {
ch = '\t';
} else if (ch == QLatin1Char('f')) {
ch = '\f';
}
sym.lexem.replace(i, 2, ch);
}
}
}
/*
if (sym.token == TOK_SEQUENCE) {
Symbol s;
s.token = TOK_LBRACKET;
s.lexem = "[";
symbols.append(s);
for (int i = 1; i < sym.lexem.length() - 1; ++i) {
s.token = TOK_STRING;
s.lexem = sym.lexem.at(i);
symbols.append(s);
}
s.token = TOK_RBRACKET;
s.lexem = "]";
symbols.append(s);
tok = tokenizer.lex();
continue;
}
*/
symbols.append(sym);
tok = tokenizer.lex();
}
#else
int pos = 0;
bool insideSet = false;
while (pos < input.length()) {
QChar ch = input.at(pos);
Symbol sym;
sym.column = pos;
sym.token = TOK_INVALID;
sym.lexem = QString(ch);
switch (ch.toLatin1()) {
case '"': {
if (insideSet) {
sym.token = TOK_STRING;
sym.lexem = QString(ch);
symbols += sym;
++pos;
continue;
}
if (pos + 1 >= input.length())
return;
int quoteEnd = skipQuote(input, pos + 1);
sym.token = TOK_STRING;
sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
symbols += sym;
pos = quoteEnd;
continue;
}
case '{':
sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
break;
case '}':
sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
break;
case '[':
insideSet = true;
sym.token = TOK_LBRACKET;
break;
case ']':
insideSet = false;
sym.token = TOK_RBRACKET;
break;
case '(':
sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
break;
case ')':
sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
break;
case ',':
sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
break;
case '*':
sym.token = (insideSet ? TOK_STRING : TOK_STAR);
break;
case '|':
sym.token = (insideSet ? TOK_STRING : TOK_OR);
break;
case '?':
sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
break;
case '.':
sym.token = (insideSet ? TOK_STRING : TOK_DOT);
break;
case '+':
sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
break;
case '\\':
++pos;
if (pos >= input.length())
return;
ch = input.at(pos);
if (ch == QLatin1Char('n')) {
ch = '\n';
} else if (ch == QLatin1Char('r')) {
ch = '\r';
} else if (ch == QLatin1Char('t')) {
ch = '\t';
} else if (ch == QLatin1Char('f')) {
ch = '\f';
}
// fall through
default:
sym.token = TOK_STRING;
sym.lexem = QString(ch);
symbols += sym;
++pos;
continue;
}
symbols += sym;
++pos;
}
#endif
#if 0
foreach (Symbol s, symbols) {
qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
}
#endif
}
bool RE2NFA::next(Token t)
{
if (hasNext() && next() == t)
return true;
errorColumn = symbol().column;
Q_ASSERT(false);
return false;
}
bool RE2NFA::test(Token t)
{
if (index >= symbols.count())
return false;
if (symbols.at(index).token == t) {
++index;
return true;
}
return false;
}
QString RE2NFA::lexemUntil(Token t)
{
QString lexem;
while (hasNext() && next() != t)
lexem += symbol().lexem;
return lexem;
}

78
util/lexgen/re2nfa.h Normal file
View File

@ -0,0 +1,78 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#ifndef RE2NFA_H
#define RE2NFA_H
#include "nfa.h"
#include <QSet>
class RE2NFA
{
public:
RE2NFA(const QMap<QString, NFA> &macros, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs);
NFA parse(const QString &expression, int *errorColumn = 0);
private:
NFA parseExpr();
NFA parseBranch();
NFA parsePiece();
NFA parseAtom();
NFA parseMaybeQuantifier(const NFA &nfa);
NFA parseSet();
NFA parseSet2();
NFA createCharNFA();
private:
friend class RegExpTokenizer;
enum Token {
TOK_INVALID,
TOK_STRING,
TOK_LBRACE, // {
TOK_RBRACE, // }
TOK_LBRACKET, // [
TOK_RBRACKET, // ]
TOK_LPAREN, // (
TOK_RPAREN, // )
TOK_COMMA,
TOK_STAR,
TOK_OR,
TOK_QUESTION,
TOK_DOT,
TOK_PLUS,
TOK_SEQUENCE,
TOK_QUOTED_STRING
};
struct Symbol
{
inline Symbol() : token(TOK_INVALID), column(-1) {}
inline Symbol(Token t, const QString &l = QString()) : token(t), lexem(l), column(-1) {}
Token token;
QString lexem;
int column;
};
inline bool hasNext() const { return index < symbols.count(); }
inline Token next() { return symbols.at(index++).token; }
bool next(Token t);
bool test(Token t);
inline void prev() { index--; }
inline const Symbol &symbol() const { return symbols.at(index - 1); }
QString lexemUntil(Token t);
void tokenize(const QString &input);
QMap<QString, NFA> macros;
QList<Symbol> symbols;
int index;
int errorColumn;
const QSet<InputType> maxInputSet;
Qt::CaseSensitivity caseSensitivity;
};
#endif // RE2NFA_H

9
util/lexgen/test.lexgen Normal file
View File

@ -0,0 +1,9 @@
[Options]
case-insensitive
classname = TestScanner
[Tokens]
TOK_C = [abcd]
TOK_B = [bc]
TOK_A = a

View File

@ -0,0 +1 @@
LETX

View File

@ -0,0 +1 @@
TOK_LET|LET

View File

@ -0,0 +1,3 @@
[Tokens]
TOK_LET = LET
TOK_LETXX = LETXX

View File

@ -0,0 +1 @@
LETXTRA

View File

@ -0,0 +1,2 @@
TOK_LET|LET
TOK_XTRA|XTRA

View File

@ -0,0 +1,4 @@
[Tokens]
TOK_LET = LET
TOK_LETXX = LETXX
TOK_XTRA = XTRA

View File

@ -0,0 +1 @@
abcdAbcDABCDeFgEFGefgEfghiHIHihI

View File

@ -0,0 +1,14 @@
TOK_AB|ab
TOK_CD|cd
TOK_AB|Ab
TOK_CD|cD
TOK_AB|AB
TOK_CD|CD
TOK_EFG|eFg
TOK_EFG|EFG
TOK_EFG|efg
TOK_EFG|Efg
TOK_HI|hi
TOK_HI|HI
TOK_HI|Hi
TOK_HI|hI

View File

@ -0,0 +1,7 @@
[Options]
case-insensitive
[Tokens]
TOK_AB = ab
TOK_CD = cd
TOK_EFG = [e-g]{3}
TOK_HI = [hi]{2}

View File

@ -0,0 +1 @@
/* comment with stuff *//*another comment with * stars * inside*/

View File

@ -0,0 +1,2 @@
TOK_COMMENT|/* comment with stuff */
TOK_COMMENT|/*another comment with * stars * inside*/

View File

@ -0,0 +1,2 @@
[Tokens]
TOK_COMMENT = \/\*[^*]*\*+([^/*][^*]*\*+)*\/

1
util/lexgen/tests/testdata/dot/input vendored Normal file
View File

@ -0,0 +1 @@
afbcxd

2
util/lexgen/tests/testdata/dot/output vendored Normal file
View File

@ -0,0 +1,2 @@
TOK_AB|afb
TOK_CD|cxd

View File

@ -0,0 +1,3 @@
[Tokens]
TOK_AB = a.b
TOK_CD = c.d

View File

@ -0,0 +1 @@
aycabd

View File

@ -0,0 +1,2 @@
TOK_A|ayc
TOK_B|abd

View File

@ -0,0 +1,3 @@
[Tokens]
TOK_A = a[^b]c
TOK_B = abd

View File

@ -0,0 +1 @@
"a

View File

@ -0,0 +1 @@
TOK_QUOTEA|"a

View File

@ -0,0 +1,2 @@
[Tokens]
TOK_QUOTEA = ["]a

View File

@ -0,0 +1 @@
quotedstring

View File

@ -0,0 +1 @@
TOK_STR|quotedstring

View File

@ -0,0 +1,2 @@
[Tokens]
TOK_STR = "quotedstring"

View File

@ -0,0 +1 @@
abcd

View File

@ -0,0 +1,2 @@
TOK_AB|ab
TOK_CD|cd

View File

@ -0,0 +1,3 @@
[Tokens]
TOK_AB = ab
TOK_CD = cd

View File

@ -0,0 +1 @@
abaf

View File

@ -0,0 +1,2 @@
TOK_AB|ab
TOK_AZ|af

View File

@ -0,0 +1,3 @@
[Tokens]
TOK_AB = ab
TOK_AZ = a[a-z]

View File

@ -0,0 +1 @@
abd

View File

@ -0,0 +1,3 @@
TOK_A|a
TOK_B|b
TOK_D|d

View File

@ -0,0 +1,4 @@
[Tokens]
TOK_D = [abcd]
TOK_B = [bc]
TOK_A = a

View File

@ -0,0 +1,5 @@
SOURCES += tst_lexgen.cpp
TARGET = tst_lexgen
include(../lexgen.pri)
QT = core testlib
DEFINES += SRCDIR=\\\"$$PWD\\\"

View File

@ -0,0 +1,247 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
#include <QTest>
#define AUTOTEST
#include "../main.cpp"
class tst_LexGen : public QObject
{
Q_OBJECT
private slots:
void nfa_singleInput();
void nfa_alternating();
void nfa_concatenating();
void nfa_optional();
void nfa_toDFA_data();
void nfa_toDFA();
void lexgen_data();
void lexgen();
};
void tst_LexGen::nfa_singleInput()
{
NFA nfa = NFA::createSingleInputNFA('a');
QCOMPARE(nfa.initialState, 0);
QCOMPARE(nfa.finalState, 1);
QCOMPARE(nfa.states.count(), 2);
QCOMPARE(nfa.states.at(0).transitions.count(), 1);
QVERIFY(nfa.states.at(0).transitions.contains('a'));
QCOMPARE(nfa.states.at(0).transitions.values('a').count(), 1);
QCOMPARE(nfa.states.at(0).transitions.value('a'), nfa.finalState);
QVERIFY(nfa.states.at(1).transitions.isEmpty());
}
void tst_LexGen::nfa_alternating()
{
NFA a = NFA::createSingleInputNFA('a');
NFA b = NFA::createSingleInputNFA('b');
NFA nfa = NFA::createAlternatingNFA(a, b);
const int initialA = 1;
const int finalA = 2;
const int initialB = 3;
const int finalB = 4;
QCOMPARE(nfa.states.count(), 6);
QCOMPARE(nfa.initialState, 0);
QCOMPARE(nfa.finalState, 5);
QList<int> initialTransitions = nfa.states.at(0).transitions.values(Epsilon);
QCOMPARE(initialTransitions.count(), 2);
QVERIFY(initialTransitions.contains(initialA));
QVERIFY(initialTransitions.contains(initialB));
// no need to test the individual a and b NFAs, the other
// autotest already takes care of that. Just check whether
// the epsilon transitions to the final state exist.
QCOMPARE(nfa.states.at(finalA).transitions.count(), 1);
QCOMPARE(nfa.states.at(finalA).transitions.values(Epsilon).count(), 1);
QCOMPARE(nfa.states.at(finalA).transitions.value(Epsilon), nfa.finalState);
QCOMPARE(nfa.states.at(finalB).transitions.count(), 1);
QCOMPARE(nfa.states.at(finalB).transitions.values(Epsilon).count(), 1);
QCOMPARE(nfa.states.at(finalB).transitions.value(Epsilon), nfa.finalState);
}
void tst_LexGen::nfa_concatenating()
{
NFA a = NFA::createSingleInputNFA('a');
NFA b = NFA::createSingleInputNFA('b');
NFA nfa = NFA::createConcatenatingNFA(a, b);
const int initialA = 1;
const int finalA = 2;
const int initialB = 3;
const int finalB = 4;
QCOMPARE(nfa.states.count(), 6);
QCOMPARE(nfa.initialState, 0);
QCOMPARE(nfa.finalState, 5);
QCOMPARE(nfa.states.at(0).transitions.count(), 1);
QCOMPARE(nfa.states.at(0).transitions.values(Epsilon).count(), 1);
QCOMPARE(nfa.states.at(0).transitions.value(Epsilon), initialA);
QCOMPARE(nfa.states.at(finalA).transitions.values(Epsilon).count(), 1);
QCOMPARE(nfa.states.at(finalA).transitions.value(Epsilon), initialB);
QCOMPARE(nfa.states.at(finalB).transitions.values(Epsilon).count(), 1);
QCOMPARE(nfa.states.at(finalB).transitions.value(Epsilon), nfa.finalState);
}
void tst_LexGen::nfa_optional()
{
NFA a = NFA::createSingleInputNFA('a');
NFA nfa = NFA::createOptionalNFA(a);
const int initialA = 1;
const int finalA = 2;
QCOMPARE(nfa.states.count(), 4);
QCOMPARE(nfa.initialState, 0);
QCOMPARE(nfa.finalState, 3);
QCOMPARE(nfa.states.at(0).transitions.count(), 2);
QList<int> initialTransitions = nfa.states.at(0).transitions.values(Epsilon);
QVERIFY(initialTransitions.contains(nfa.finalState));
QVERIFY(initialTransitions.contains(initialA));
QList<int> finalEpsilonATransitions = nfa.states.at(finalA).transitions.values(Epsilon);
QVERIFY(finalEpsilonATransitions.contains(initialA));
QVERIFY(finalEpsilonATransitions.contains(nfa.finalState));
}
Q_DECLARE_METATYPE(NFA);
Q_DECLARE_METATYPE(DFA);
void tst_LexGen::nfa_toDFA_data()
{
QTest::addColumn<NFA>("nfa");
QTest::addColumn<DFA>("expectedDFA");
NFA a = NFA::createSingleInputNFA('a');
NFA b = NFA::createSingleInputNFA('b');
NFA c = NFA::createSingleInputNFA('c');
NFA nfa;
DFA dfa;
dfa.clear();
dfa.resize(3);
dfa[0].transitions.insert('a', 1);
dfa[1].transitions.insert('b', 2);
nfa = NFA::createConcatenatingNFA(a, b);
QTest::newRow("simple concat") << nfa << dfa;
dfa.clear();
dfa.resize(3);
dfa[0].transitions.insert('a', 1);
dfa[0].transitions.insert('b', 2);
nfa = NFA::createAlternatingNFA(a, b);
QTest::newRow("simple alternate") << nfa << dfa;
}
void tst_LexGen::nfa_toDFA()
{
QFETCH(NFA, nfa);
QFETCH(DFA, expectedDFA);
DFA dfa = nfa.toDFA();
QCOMPARE(dfa.count(), expectedDFA.count());
for (int i = 0; i < dfa.count(); ++i) {
if (dfa.at(i).transitions != expectedDFA.at(i).transitions) {
qDebug() << "DFAs differ in state" << i;
qDebug() << "NFA:";
nfa.debug();
qDebug() << "Actual DFA:";
dfa.debug();
qDebug() << "Expected DFA:";
expectedDFA.debug();
QVERIFY(false);
}
}
}
void tst_LexGen::lexgen_data()
{
QTest::addColumn<QString>("ruleFile");
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("expectedOutput");
QDir d(QString(SRCDIR));
d.cd("testdata");
foreach (QString test, d.entryList(QDir::Dirs | QDir::NoDotAndDotDot)) {
QString dir = d.absoluteFilePath(test) + '/';
QTest::newRow(qPrintable(test))
<< dir + "rules.lexgen"
<< dir + "input"
<< dir + "output"
;
}
}
void tst_LexGen::lexgen()
{
QFETCH(QString, ruleFile);
QFETCH(QString, input);
QFETCH(QString, expectedOutput);
Config conf;
QVERIFY(loadConfig(ruleFile, &conf));
DFA dfa = generateMachine(conf);
QVERIFY(!dfa.isEmpty());
conf.debug = true;
QFile f(input);
QVERIFY(f.open(QIODevice::ReadOnly));
input = QString::fromUtf8(f.readAll());
f.close();
if (input.endsWith(QLatin1Char('\n')))
input.chop(1);
// machine.debug();
bool ok = false;
QList<Symbol> symbols = tokenize(dfa, input, &conf, &ok);
QVERIFY(ok);
f.setFileName(expectedOutput);
QVERIFY(f.open(QIODevice::ReadOnly));
QStringList lines;
while (!f.atEnd()) {
QString line = QString::fromUtf8(f.readLine());
if (line.endsWith(QLatin1Char('\n')))
line.chop(1);
lines << line;
}
f.close();
// dfa.debug();
QCOMPARE(lines.count(), symbols.count());
for (int i = 0; i < lines.count(); ++i) {
QStringList l = lines.at(i).split(QChar::fromLatin1('|'));
QCOMPARE(l.count(), 2);
QString expectedToken = l.at(0);
QString expectedLexem = l.at(1);
QCOMPARE(symbols.at(i).token, expectedToken);
QCOMPARE(symbols.at(i).lexem, expectedLexem);
}
}
QTEST_MAIN(tst_LexGen)
#include "tst_lexgen.moc"

199
util/lexgen/tokenizer.cpp Normal file
View File

@ -0,0 +1,199 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
// auto generated. DO NOT EDIT.
class RegExpTokenizer
{
public:
RegExpTokenizer(const QString &inp);
inline QChar next() {
return (pos < input.length()) ? input.at(pos++) : QChar();
}
int lex();
QString input;
int pos;
int lexemStart;
int lexemLength;
};
RegExpTokenizer::RegExpTokenizer(const QString &inp)
{
input = inp;
pos = 0;
lexemStart = 0;
lexemLength = 0;
}
int RegExpTokenizer::lex()
{
lexemStart = pos;
lexemLength = 0;
int lastAcceptingPos = -1;
int token = -1;
QChar ch;
// initial state
ch = next();
if (ch.unicode() >= 1 && ch.unicode() <= 33)
goto state_1;
if (ch.unicode() == 34)
goto state_2;
if (ch.unicode() >= 35 && ch.unicode() <= 39)
goto state_1;
if (ch.unicode() == 40) {
token = RE2NFA::TOK_LPAREN;
goto found;
}
if (ch.unicode() == 41) {
token = RE2NFA::TOK_RPAREN;
goto found;
}
if (ch.unicode() == 42) {
token = RE2NFA::TOK_STAR;
goto found;
}
if (ch.unicode() == 43) {
token = RE2NFA::TOK_PLUS;
goto found;
}
if (ch.unicode() == 44) {
token = RE2NFA::TOK_COMMA;
goto found;
}
if (ch.unicode() == 45)
goto state_1;
if (ch.unicode() == 46) {
token = RE2NFA::TOK_DOT;
goto found;
}
if (ch.unicode() >= 47 && ch.unicode() <= 62)
goto state_1;
if (ch.unicode() == 63) {
token = RE2NFA::TOK_QUESTION;
goto found;
}
if (ch.unicode() >= 64 && ch.unicode() <= 90)
goto state_1;
if (ch.unicode() == 91)
goto state_10;
if (ch.unicode() == 92)
goto state_11;
if (ch.unicode() >= 93 && ch.unicode() <= 122)
goto state_1;
if (ch.unicode() == 123) {
token = RE2NFA::TOK_LBRACE;
goto found;
}
if (ch.unicode() == 124) {
token = RE2NFA::TOK_OR;
goto found;
}
if (ch.unicode() == 125) {
token = RE2NFA::TOK_RBRACE;
goto found;
}
if (ch.unicode() >= 126)
goto state_1;
goto out;
state_1:
lastAcceptingPos = pos;
token = RE2NFA::TOK_STRING;
goto out;
state_2:
lastAcceptingPos = pos;
token = RE2NFA::TOK_STRING;
ch = next();
if (ch.unicode() >= 1 && ch.unicode() <= 33)
goto state_15;
if (ch.unicode() == 34)
goto state_16;
if (ch.unicode() >= 35)
goto state_15;
goto out;
state_10:
ch = next();
if (ch.unicode() >= 1 && ch.unicode() <= 91)
goto state_17;
if (ch.unicode() == 92)
goto state_18;
if (ch.unicode() == 93)
goto state_19;
if (ch.unicode() >= 94)
goto state_17;
goto out;
state_11:
lastAcceptingPos = pos;
token = RE2NFA::TOK_STRING;
ch = next();
if (ch.unicode() >= 1)
goto state_20;
goto out;
state_15:
ch = next();
if (ch.unicode() >= 1 && ch.unicode() <= 33)
goto state_15;
if (ch.unicode() == 34)
goto state_16;
if (ch.unicode() >= 35)
goto state_15;
goto out;
state_16:
lastAcceptingPos = pos;
token = RE2NFA::TOK_QUOTED_STRING;
goto out;
state_17:
ch = next();
if (ch.unicode() >= 1 && ch.unicode() <= 91)
goto state_17;
if (ch.unicode() == 92)
goto state_18;
if (ch.unicode() == 93)
goto state_19;
if (ch.unicode() >= 94)
goto state_17;
goto out;
state_18:
ch = next();
if (ch.unicode() >= 1 && ch.unicode() <= 91)
goto state_17;
if (ch.unicode() == 92)
goto state_18;
if (ch.unicode() == 93)
goto state_21;
if (ch.unicode() >= 94)
goto state_17;
goto out;
state_19:
lastAcceptingPos = pos;
token = RE2NFA::TOK_SEQUENCE;
goto out;
state_20:
lastAcceptingPos = pos;
token = RE2NFA::TOK_STRING;
goto out;
state_21:
lastAcceptingPos = pos;
token = RE2NFA::TOK_SEQUENCE;
ch = next();
if (ch.unicode() >= 1 && ch.unicode() <= 91)
goto state_17;
if (ch.unicode() == 92)
goto state_18;
if (ch.unicode() == 93)
goto state_19;
if (ch.unicode() >= 94)
goto state_17;
goto out;
found:
lastAcceptingPos = pos;
out:
if (lastAcceptingPos != -1) {
lexemLength = lastAcceptingPos - lexemStart;
pos = lastAcceptingPos;
}
return token;
}