Initial commit from sysy-main

This commit is contained in:
Lixuanwang
2025-02-27 23:14:53 +08:00
commit cc523fd30b
1125 changed files with 257793 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#include "Any.h"
using namespace antlrcpp;

View File

@@ -0,0 +1,16 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
// antlrcpp::Any is an alias for std::any (the standard type-erased value holder, comparable to boost::any).
#pragma once
#include "antlr4-common.h"
namespace antlrcpp {
using Any = std::any;
} // namespace antlrcpp

View File

@@ -0,0 +1,43 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#include "tree/ParseTree.h"
#include "Exceptions.h"
#include "support/Arrays.h"
using namespace antlrcpp;
// Renders `list` as "[e0<sep>e1<sep>...<sep>eN]".
//
// @param list       The strings to print, in order.
// @param separator  Text inserted between two consecutive entries.
// @return The bracketed, separated representation; "[]" for an empty list.
std::string Arrays::listToString(const std::vector<std::string> &list, const std::string &separator)
{
  std::stringstream ss;
  bool firstEntry = true;

  ss << '[';
  for (const auto &entry : list) {
    // The separator belongs *between* entries: emit it before every entry
    // except the first one (never after the last entry).
    if (!firstEntry) {
      ss << separator;
    }
    firstEntry = false;
    ss << entry;
  }
  ss << ']';
  return ss.str();
}
// Specialization for parse tree nodes: prints each node via toStringTree()
// and joins the results as "[t0, t1, ...]".
// Every pointer in `source` is dereferenced, so none of them may be null.
template <>
std::string Arrays::toString(const std::vector<antlr4::tree::ParseTree*> &source) {
  std::string result = "[";
  bool firstEntry = true;
  for (auto *value : source) {
    // ", " goes between entries, i.e. before every entry but the first.
    if (!firstEntry) {
      result += ", ";
    }
    firstEntry = false;
    result += value->toStringTree();
  }
  return result + "]";
}

View File

@@ -0,0 +1,149 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include "antlr4-common.h"
namespace antlrcpp {
class ANTLR4CPP_PUBLIC Arrays {
public:
static std::string listToString(const std::vector<std::string> &list, const std::string &separator);
template <typename T>
static bool equals(const std::vector<T> &a, const std::vector<T> &b) {
if (a.size() != b.size())
return false;
for (size_t i = 0; i < a.size(); ++i)
if (!(a[i] == b[i]))
return false;
return true;
}
template <typename T>
static bool equals(const std::vector<T *> &a, const std::vector<T *> &b) {
if (a.size() != b.size())
return false;
for (size_t i = 0; i < a.size(); ++i) {
if (!a[i] && !b[i])
continue;
if (!a[i] || !b[i])
return false;
if (a[i] == b[i])
continue;
if (!(*a[i] == *b[i]))
return false;
}
return true;
}
template <typename T>
static bool equals(const std::vector<Ref<T>> &a, const std::vector<Ref<T>> &b) {
if (a.size() != b.size())
return false;
for (size_t i = 0; i < a.size(); ++i) {
if (!a[i] && !b[i])
continue;
if (!a[i] || !b[i])
return false;
if (a[i] == b[i])
continue;
if (!(*a[i] == *b[i]))
return false;
}
return true;
}
template <typename T>
static bool equals(const std::vector<std::unique_ptr<T>> &a, const std::vector<std::unique_ptr<T>> &b) {
if (a.size() != b.size())
return false;
for (size_t i = 0; i < a.size(); ++i) {
if (!a[i] && !b[i])
continue;
if (!a[i] || !b[i])
return false;
if (a[i] == b[i])
continue;
if (!(*a[i] == *b[i]))
return false;
}
return true;
}
template <typename T>
static std::string toString(const std::vector<T> &source) {
std::string result = "[";
bool firstEntry = true;
for (auto &value : source) {
result += value.toString();
if (firstEntry) {
result += ", ";
firstEntry = false;
}
}
return result + "]";
}
template <typename T>
static std::string toString(const std::vector<Ref<T>> &source) {
std::string result = "[";
bool firstEntry = true;
for (auto &value : source) {
result += value->toString();
if (firstEntry) {
result += ", ";
firstEntry = false;
}
}
return result + "]";
}
template <typename T>
static std::string toString(const std::vector<std::unique_ptr<T>> &source) {
std::string result = "[";
bool firstEntry = true;
for (auto &value : source) {
result += value->toString();
if (firstEntry) {
result += ", ";
firstEntry = false;
}
}
return result + "]";
}
template <typename T>
static std::string toString(const std::vector<T *> &source) {
std::string result = "[";
bool firstEntry = true;
for (auto value : source) {
result += value->toString();
if (firstEntry) {
result += ", ";
firstEntry = false;
}
}
return result + "]";
}
};
// Explicit specialization of Arrays::toString for vectors of parse tree nodes.
// Only declared here; the definition is provided out of line.
template <>
std::string Arrays::toString(const std::vector<antlr4::tree::ParseTree *> &source);
}

View File

@@ -0,0 +1,76 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include "antlr4-common.h"
namespace antlrcpp {
// A fixed-size (2048 bit) bit set with a few convenience helpers on top of std::bitset.
class ANTLR4CPP_PUBLIC BitSet : public std::bitset<2048> {
public:
  // Returns the index of the first set bit at or after `pos`,
  // or INVALID_INDEX when there is none (including when `pos` is out of range).
  size_t nextSetBit(size_t pos) const {
    for (size_t i = pos; i < size(); i++){
      if (test(i)) {
        return i;
      }
    }

    return INVALID_INDEX;
  }

  // Prints a list of every index for which the bitset contains a bit in true,
  // formatted as "{i0, i1, ...}" (same layout as toString()).
  friend std::wostream& operator << (std::wostream& os, const BitSet& obj)
  {
    os << "{";
    bool valueAdded = false;
    for (size_t i = 0; i < obj.size(); i++){
      if (obj.test(i)){
        // Separator goes before every index except the first one printed.
        if (valueAdded) {
          os << ", ";
        }
        os << i;
        valueAdded = true;
      }
    }

    os << "}";
    return os;
  }

  // Concatenates the toString() output of every bit set in [begin, end],
  // i.e. the range is inclusive on both sides.
  // NOTE(review): `end` is dereferenced, so it must refer to a valid element;
  // passing a container's past-the-end iterator is undefined behavior.
  static std::string subStringRepresentation(const std::vector<BitSet>::iterator &begin,
                                             const std::vector<BitSet>::iterator &end) {
    std::string result;
    std::vector<BitSet>::iterator vectorIterator;

    for (vectorIterator = begin; vectorIterator != end; ++vectorIterator) {
      result += vectorIterator->toString();
    }
    // Grab the end
    result += end->toString();

    return result;
  }

  // Returns the indices of all set bits formatted as "{i0, i1, ...}"; "{}" when no bit is set.
  std::string toString() const {
    std::stringstream stream;
    stream << "{";
    bool valueAdded = false;
    for (size_t i = 0; i < size(); ++i){
      if (test(i)){
        if (valueAdded) {
          stream << ", ";
        }
        stream << i;
        valueAdded = true;
      }
    }

    stream << "}";
    return stream.str();
  }
};
}

View File

@@ -0,0 +1,207 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#include "support/CPPUtils.h"
namespace antlrcpp {
// Concatenates all entries of `strings`, placing `separator` between each
// pair of neighbours. An empty vector yields an empty string.
std::string join(const std::vector<std::string> &strings, const std::string &separator) {
  std::string joined;
  for (auto it = strings.begin(); it != strings.end(); ++it) {
    if (it != strings.begin()) {
      joined += separator;
    }
    joined += *it;
  }
  return joined;
}
// Builds a lookup table from each key to its position in `keys`.
// When a key occurs more than once, the position of its first occurrence is kept.
std::map<std::string, size_t> toMap(const std::vector<std::string> &keys) {
  std::map<std::string, size_t> indexByKey;
  size_t position = 0;
  for (const std::string &key : keys) {
    // emplace() leaves an already present key untouched.
    indexByKey.emplace(key, position);
    ++position;
  }
  return indexByKey;
}
// Returns a copy of `str` in which newline, carriage return and tab are
// replaced by the two-character sequences \n, \r and \t. When `escapeSpaces`
// is set, every space is additionally replaced by a middle dot (U+00B7).
std::string escapeWhitespace(std::string str, bool escapeSpaces) {
  std::string escaped;
  for (const char ch : str) {
    if (ch == '\n') {
      escaped += "\\n";
    } else if (ch == '\r') {
      escaped += "\\r";
    } else if (ch == '\t') {
      escaped += "\\t";
    } else if (ch == ' ' && escapeSpaces) {
      escaped += "\u00B7";
    } else {
      escaped += ch;
    }
  }
  return escaped;
}
// Formats `t` as upper-case hexadecimal digits, without any prefix or padding.
std::string toHexString(const int t) {
  std::stringstream out;
  out.setf(std::ios_base::hex, std::ios_base::basefield);
  out.setf(std::ios_base::uppercase);
  out << t;
  return out.str();
}
// Concatenates every string in `data`, in order and without any separator.
std::string arrayToString(const std::vector<std::string> &data) {
  // First pass: determine the final length so the result is allocated once.
  size_t totalLength = 0;
  for (size_t i = 0; i < data.size(); ++i) {
    totalLength += data[i].size();
  }

  std::string combined;
  combined.reserve(totalLength);
  for (size_t i = 0; i < data.size(); ++i) {
    combined += data[i];
  }
  return combined;
}
// Returns a copy of `s` in which every non-overlapping occurrence of `from`
// (searched left to right) is replaced by `to`.
//
// An empty `from` matches nothing, so `s` is returned unchanged. (Searching
// for an empty string always succeeds at the current position and would
// otherwise never make progress.)
std::string replaceString(const std::string &s, const std::string &from, const std::string &to) {
  if (from.empty())
    return s;

  std::string res;
  res.reserve(s.size());

  // Walk over `s` once, copying the text between matches straight from the
  // source instead of re-copying the remaining tail after every replacement.
  std::string::size_type start = 0;
  std::string::size_type p = s.find(from);
  while (p != std::string::npos) {
    res.append(s, start, p - start).append(to);
    start = p + from.size();
    p = s.find(from, start);
  }
  res.append(s, start, std::string::npos);

  return res;
}
// Splits `s` at occurrences of `sep`.
//
// @param s      The text to split. An empty string yields an empty vector.
// @param sep    The separator. An empty separator never matches, so the result
//               then consists of `s` alone.
// @param count  Maximum number of splits to perform; zero or a negative value
//               means "no limit". The unsplit remainder is always the last part.
// @return The parts in order of appearance (separators removed).
std::vector<std::string> split(const std::string &s, const std::string &sep, int count) {
  std::vector<std::string> parts;
  if (s.empty())
    return parts;

  if (sep.empty()) {
    // find("") succeeds without consuming input, which would loop forever.
    parts.push_back(s);
    return parts;
  }

  // A negative value stands for "unlimited" and is never decremented.
  int remaining = (count == 0) ? -1 : count;

  std::string::size_type start = 0;
  while (start < s.size() && remaining != 0) {
    const std::string::size_type p = s.find(sep, start);
    if (p == std::string::npos)
      break;

    parts.push_back(s.substr(start, p - start));
    start = p + sep.size();
    if (remaining > 0)
      --remaining;
  }
  parts.push_back(s.substr(start));

  return parts;
}
//--------------------------------------------------------------------------------------------------
// Debugging helper. Prefixes each line of `s` with `indentation`; the first
// line is left as is when `includingFirst` is false.
std::string indent(const std::string &s, const std::string &indentation, bool includingFirst) {
  std::vector<std::string> lines = split(s, "\n", -1);
  const size_t firstIndented = includingFirst ? 0 : 1;
  for (size_t index = firstIndented; index < lines.size(); ++index) {
    lines[index] = indentation + lines[index];
  }
  return join(lines, "\n");
}
//--------------------------------------------------------------------------------------------------
// Recursively get the error from a, possibly nested, exception.
// Returns the exception nested inside `e`, or a null exception_ptr when `e`
// does not carry a nested exception. T must be a polymorphic type.
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER < 190023026
// No nested exceptions before VS 2015.
template <typename T>
std::exception_ptr get_nested(const T &/*e*/) {
  return nullptr;
}
#else
template <typename T>
std::exception_ptr get_nested(const T &e) {
  // The pointer form of dynamic_cast reports a mismatch with a null pointer,
  // so no std::bad_cast has to be thrown and caught for plain exceptions and
  // the nested_exception sub-object does not have to be copied.
  const auto *nested = dynamic_cast<const std::nested_exception *>(&e);
  if (nested == nullptr) {
    return nullptr;
  }
  return nested->nested_ptr();
}
#endif
// Builds a description of the exception behind `eptr`. Messages of nested
// exceptions are appended in parentheses, e.g. "outer (inner (innermost))".
// Throws std::bad_exception when `eptr` is null.
std::string what(std::exception_ptr eptr) {
  if (!eptr) {
    throw std::bad_exception();
  }

  std::string message;
  std::size_t depth = 0;

  for (;;) {
    try {
      // Move the pending exception out of eptr; eptr stays empty unless the
      // std::exception handler below finds another nested level.
      std::exception_ptr current;
      std::swap(eptr, current);
      std::rethrow_exception(current);
    }
    catch (const std::exception &e) {
      message += e.what();
      eptr = get_nested(e);
    }
    catch (const std::string &e) {
      message += e;
    }
    catch (const char *e) {
      message += e;
    }
    catch (...) {
      message += "cannot be determined";
    }

    if (!eptr) {
      break;
    }

    // Another level follows: open a parenthesis that is closed at the very end.
    message += " (";
    ++depth;
  }

  message.append(depth, ')');
  return message;
}
} // namespace antlrcpp

View File

@@ -0,0 +1,65 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include "antlr4-common.h"
namespace antlrcpp {
// Concatenates `strings`, inserting `separator` between consecutive entries.
ANTLR4CPP_PUBLIC std::string join(const std::vector<std::string> &strings, const std::string &separator);
// Maps every entry of `keys` to its index in the vector.
ANTLR4CPP_PUBLIC std::map<std::string, size_t> toMap(const std::vector<std::string> &keys);
// Replaces newline, carriage return and tab by the character sequences \n, \r and \t.
// If `escapeSpaces` is true, spaces are replaced by a middle dot (U+00B7) as well.
ANTLR4CPP_PUBLIC std::string escapeWhitespace(std::string str, bool escapeSpaces);
// Formats `t` as upper-case hexadecimal digits without a prefix.
ANTLR4CPP_PUBLIC std::string toHexString(const int t);
// Concatenates all entries of `data` without a separator.
ANTLR4CPP_PUBLIC std::string arrayToString(const std::vector<std::string> &data);
// Returns a copy of `s` with the occurrences of `from` replaced by `to`.
ANTLR4CPP_PUBLIC std::string replaceString(const std::string &s, const std::string &from, const std::string &to);
// Splits `s` at occurrences of `sep`. `count` limits the number of splits; zero or a
// negative value means no limit. An empty `s` produces an empty vector.
ANTLR4CPP_PUBLIC std::vector<std::string> split(const std::string &s, const std::string &sep, int count);
// Debugging helper: prefixes the lines of `s` with `indentation`. The first line is
// only indented when `includingFirst` is true.
ANTLR4CPP_PUBLIC std::string indent(const std::string &s, const std::string &indentation, bool includingFirst = true);
// A "finally" replacement: stores a callable and invokes it from the destructor
// unless the action was disabled or has been moved into another FinalAction.
template <typename OnEnd>
struct FinalAction {
  FinalAction(OnEnd f) : _onExit { std::move(f) } {}

  FinalAction(FinalAction &&source)
    : _onExit(std::move(source._onExit)), _armed(source._armed) {
    // The new object is responsible now; the source must not fire anymore.
    source._armed = false;
  }

  // Cancels the pending action.
  void disable() { _armed = false; }

  ~FinalAction() {
    if (!_armed) {
      return;
    }
    _onExit();
  }

private:
  OnEnd _onExit;
  bool _armed {true};
};
// Wraps `f` in a FinalAction, deducing the callable's type from the argument.
template <typename OnEnd>
FinalAction<OnEnd> finally(OnEnd f) {
  using Guard = FinalAction<OnEnd>;
  return Guard(std::move(f));
}
// Convenience functions to avoid lengthy dynamic_cast() != nullptr checks in many places.
// Tells whether `obj` can be dynamic_cast to T1, which is given as a pointer
// type (e.g. is<Derived *>(base)). A null `obj` yields false.
template <typename T1, typename T2>
inline bool is(T2 *obj) { // For pointer types.
  using Target = typename std::add_const<T1>::type;
  const bool matches = dynamic_cast<Target>(obj) != nullptr;
  return matches;
}
// Tells whether the object held by `obj` can be dynamic_cast to T1 (given as
// the plain class type here, not as a pointer type).
template <typename T1, typename T2>
inline bool is(Ref<T2> const& obj) { // For shared pointers.
  auto *raw = obj.get();
  return dynamic_cast<T1 *>(raw) != nullptr;
}
// Generic fallback description of an object: "<type name>@<address in hex>".
template <typename T>
std::string toString(const T &o) {
  const auto address = reinterpret_cast<uintptr_t>(&o);

  std::stringstream ss;
  // typeid only provides the (usually mangled) class name; nothing better is
  // available in a portable way.
  ss << typeid(o).name();
  ss << "@";
  ss << std::hex << address;
  return ss.str();
}
// Get the error text from an exception pointer or the current exception.
// Messages of nested exceptions are appended in parentheses.
// Throws std::bad_exception if `eptr` is null (e.g. when called without an
// argument while no exception is being handled).
ANTLR4CPP_PUBLIC std::string what(std::exception_ptr eptr = std::current_exception());
} // namespace antlrcpp

View File

@@ -0,0 +1,34 @@
/* Copyright (c) 2012-2021 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include <cassert>
#include <memory>
#include <type_traits>
namespace antlrcpp {
// Casts a base class pointer to a derived class pointer with static_cast.
// When assertions are enabled and RTTI is available, the cast is verified
// with dynamic_cast first. A null pointer is passed through unchanged.
template <typename To, typename From>
To downCast(From* from) {
  using Target = std::remove_pointer_t<To>;
  static_assert(std::is_pointer<To>::value, "Target type not a pointer.");
  static_assert(std::is_base_of<From, Target>::value, "Target type not derived from source type.");
#if !defined(__GNUC__) || defined(__GXX_RTTI)
  assert(from == nullptr || dynamic_cast<To>(from) != nullptr);
#endif
  To result = static_cast<To>(from);
  return result;
}
// Casts a base class reference to a derived class reference with static_cast.
// When assertions are enabled and RTTI is available, the cast is verified
// with dynamic_cast first.
template <typename To, typename From>
To downCast(From& from) {
  using Target = std::remove_reference_t<To>;
  using TargetPointer = std::add_pointer_t<Target>;
  static_assert(std::is_lvalue_reference<To>::value, "Target type not a lvalue reference.");
  static_assert(std::is_base_of<From, Target>::value, "Target type not derived from source type.");
#if !defined(__GNUC__) || defined(__GXX_RTTI)
  assert(dynamic_cast<TargetPointer>(std::addressof(from)) != nullptr);
#endif
  return static_cast<To>(from);
}
}

View File

@@ -0,0 +1,161 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
// Forward declarations of the runtime's types, grouped by namespace, so that
// headers can refer to them without including the full definitions.
namespace antlr4 {
class ANTLRErrorListener;
class ANTLRErrorStrategy;
class ANTLRFileStream;
class ANTLRInputStream;
class BailErrorStrategy;
class BaseErrorListener;
class BufferedTokenStream;
class CharStream;
class CommonToken;
class CommonTokenFactory;
class CommonTokenStream;
class ConsoleErrorListener;
class DefaultErrorStrategy;
class DiagnosticErrorListener;
class EmptyStackException;
class FailedPredicateException;
class IllegalArgumentException;
class IllegalStateException;
class InputMismatchException;
class IntStream;
class InterpreterRuleContext;
class Lexer;
class LexerInterpreter;
class LexerNoViableAltException;
class ListTokenSource;
class NoSuchElementException;
class NoViableAltException;
class NullPointerException;
class ParseCancellationException;
class Parser;
class ParserInterpreter;
class ParserRuleContext;
class ProxyErrorListener;
class RecognitionException;
class Recognizer;
class RuleContext;
class Token;
template<typename Symbol> class TokenFactory;
class TokenSource;
class TokenStream;
class TokenStreamRewriter;
class UnbufferedCharStream;
class UnbufferedTokenStream;
class WritableToken;
// antlr4::misc
namespace misc {
class InterpreterDataReader;
class Interval;
class IntervalSet;
class MurmurHash;
class Utils;
class Predicate;
}
// antlr4::atn
namespace atn {
class ATN;
class ATNConfig;
class ATNConfigSet;
class ATNDeserializationOptions;
class ATNDeserializer;
class ATNSerializer;
class ATNSimulator;
class ATNState;
enum class ATNType;
class ActionTransition;
class ArrayPredictionContext;
class AtomTransition;
class BasicBlockStartState;
class BasicState;
class BlockEndState;
class BlockStartState;
class DecisionState;
class EpsilonTransition;
class LL1Analyzer;
class LexerAction;
class LexerActionExecutor;
class LexerATNConfig;
class LexerATNSimulator;
class LexerMoreAction;
class LexerPopModeAction;
class LexerSkipAction;
class LookaheadEventInfo;
class LoopEndState;
class NotSetTransition;
class OrderedATNConfigSet;
class ParseInfo;
class ParserATNSimulator;
class PlusBlockStartState;
class PlusLoopbackState;
class PrecedencePredicateTransition;
class PredicateTransition;
class PredictionContext;
enum class PredictionMode;
class PredictionModeClass;
class RangeTransition;
class RuleStartState;
class RuleStopState;
class RuleTransition;
class SemanticContext;
class SetTransition;
class SingletonPredictionContext;
class StarBlockStartState;
class StarLoopEntryState;
class StarLoopbackState;
class TokensStartState;
class Transition;
class WildcardTransition;
}
// antlr4::dfa
namespace dfa {
class DFA;
class DFASerializer;
class DFAState;
class LexerDFASerializer;
class Vocabulary;
}
// antlr4::tree, including the nested pattern and xpath namespaces.
namespace tree {
class AbstractParseTreeVisitor;
class ErrorNode;
class ErrorNodeImpl;
class ParseTree;
class ParseTreeListener;
template<typename T> class ParseTreeProperty;
class ParseTreeVisitor;
class ParseTreeWalker;
class SyntaxTree;
class TerminalNode;
class TerminalNodeImpl;
class Tree;
class Trees;
// antlr4::tree::pattern
namespace pattern {
class Chunk;
class ParseTreeMatch;
class ParseTreePattern;
class ParseTreePatternMatcher;
class RuleTagToken;
class TagChunk;
class TextChunk;
class TokenTagToken;
}
// antlr4::tree::xpath
namespace xpath {
class XPath;
class XPathElement;
class XPathLexerErrorListener;
class XPathRuleAnywhereElement;
class XPathRuleElement;
class XPathTokenAnywhereElement;
class XPathTokenElement;
class XPathWildcardAnywhereElement;
class XPathWildcardElement;
}
}
}

View File

@@ -0,0 +1,38 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#include "support/StringUtils.h"
namespace antlrcpp {
// Returns a copy of `in` with tab, carriage return and newline replaced by
// the character sequences \t, \r and \n.
std::string escapeWhitespace(std::string_view in) {
  std::string escaped;
  // The two-argument overload returns the buffer it filled; trim the excess
  // capacity before handing the result out.
  escapeWhitespace(escaped, in).shrink_to_fit();
  return escaped;
}
// Appends `in` to `out`, replacing tab, carriage return and newline by the
// character sequences \t, \r and \n.
//
// @param out  Buffer to append to; existing content is kept.
// @param in   The text to escape.
// @return A reference to `out`.
std::string& escapeWhitespace(std::string& out, std::string_view in) {
  // Best case, no escaping. reserve() takes the total capacity, so the
  // characters already stored in `out` have to be accounted for as well.
  out.reserve(out.size() + in.size());
  for (const char c : in) {
    switch (c) {
      case '\t':
        out.append("\\t");
        break;
      case '\r':
        out.append("\\r");
        break;
      case '\n':
        out.append("\\n");
        break;
      default:
        out.push_back(c);
        break;
    }
  }
  return out;
}
} // namespace antlrcpp

View File

@@ -0,0 +1,16 @@
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include "antlr4-common.h"
namespace antlrcpp {
// Returns a copy of `in` with tab, carriage return and newline replaced by the
// character sequences \t, \r and \n.
ANTLR4CPP_PUBLIC std::string escapeWhitespace(std::string_view in);
// Same as above, but appends the escaped text to `out` and returns a reference to `out`.
ANTLR4CPP_PUBLIC std::string& escapeWhitespace(std::string& out, std::string_view in);
}

View File

@@ -0,0 +1,28 @@
/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include "antlr4-common.h"
namespace antlrcpp {
// Non-instantiable holder for Unicode related constants and checks.
class ANTLR4CPP_PUBLIC Unicode final {
public:
  // U+FFFD, used as a stand-in for code points that are not valid.
  static constexpr char32_t REPLACEMENT_CHARACTER = 0xfffd;

  // A code point is valid when it does not exceed U+10FFFF and does not fall
  // into the surrogate range U+D800..U+DFFF.
  static constexpr bool isValid(char32_t codePoint) {
    return codePoint <= 0x10ffff && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
  }

private:
  // Static members only: construction, copying and moving are all disabled.
  Unicode() = delete;
  Unicode(const Unicode&) = delete;
  Unicode(Unicode&&) = delete;
  Unicode& operator=(const Unicode&) = delete;
  Unicode& operator=(Unicode&&) = delete;
};
}

View File

@@ -0,0 +1,242 @@
/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#include <cassert>
#include <cstdint>
#include "support/Utf8.h"
#include "support/Unicode.h"
// The below implementation is based off of https://github.com/google/cel-cpp/internal/utf8.cc,
// which is itself based off of https://go.googlesource.com/go/+/refs/heads/master/src/unicode/utf8/utf8.go.
// If for some reason you feel the need to copy this implementation, please retain a comment
// referencing the two source files and giving credit, as well as maintaining any and all
// obligations required by the BSD 3-clause license that governs this file.
namespace antlrcpp {
// File-local constants and lookup tables used by the UTF-8 decoder and encoder below.
// Each name is #undef'd first so that a macro of the same name cannot interfere.
namespace {
// Bytes below SELF encode themselves (single byte code points).
#undef SELF
constexpr uint8_t SELF = 0x80;
// Default lowest and highest value of a continuation byte.
#undef LOW
constexpr uint8_t LOW = 0x80;
#undef HIGH
constexpr uint8_t HIGH = 0xbf;
// Payload bits of a continuation byte.
#undef MASKX
constexpr uint8_t MASKX = 0x3f;
// Payload bits of the leading byte of a 2, 3 and 4 byte sequence.
#undef MASK2
constexpr uint8_t MASK2 = 0x1f;
#undef MASK3
constexpr uint8_t MASK3 = 0xf;
#undef MASK4
constexpr uint8_t MASK4 = 0x7;
// Tag bits the encoder puts on continuation bytes (TX) and on the leading
// byte of a 2, 3 and 4 byte sequence (T2, T3, T4).
#undef TX
constexpr uint8_t TX = 0x80;
#undef T2
constexpr uint8_t T2 = 0xc0;
#undef T3
constexpr uint8_t T3 = 0xe0;
#undef T4
constexpr uint8_t T4 = 0xf0;
// Entries of the LEADING table. XX marks a byte that cannot start a sequence,
// AS is the entry used for bytes 0x00-0x7F. For the S* values the decoder
// reads the low 3 bits as the total sequence length in bytes and the high
// nibble as the index into ACCEPT (allowed range of the second byte).
#undef XX
constexpr uint8_t XX = 0xf1;
#undef AS
constexpr uint8_t AS = 0xf0;
#undef S1
constexpr uint8_t S1 = 0x02;
#undef S2
constexpr uint8_t S2 = 0x13;
#undef S3
constexpr uint8_t S3 = 0x03;
#undef S4
constexpr uint8_t S4 = 0x23;
#undef S5
constexpr uint8_t S5 = 0x34;
#undef S6
constexpr uint8_t S6 = 0x04;
#undef S7
constexpr uint8_t S7 = 0x44;
// NOLINTBEGIN
// clang-format off
// Classification of every possible first byte, indexed by the byte value.
#undef LEADING
constexpr uint8_t LEADING[256] = {
// 1 2 3 4 5 6 7 8 9 A B C D E F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
};
// clang-format on
// NOLINTEND
// Inclusive {lowest, highest} range permitted for the second byte of a
// sequence, selected by the high nibble of the LEADING entry. Only the
// first five entries are referenced by the S* values above.
#undef ACCEPT
constexpr std::pair<uint8_t, uint8_t> ACCEPT[16] = {
{LOW, HIGH}, {0xa0, HIGH}, {LOW, 0x9f}, {0x90, HIGH},
{LOW, 0x8f}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0},
{0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0},
{0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0}, {0x0, 0x0},
};
} // namespace
std::pair<char32_t, size_t> Utf8::decode(std::string_view input) {
assert(!input.empty());
const auto b = static_cast<uint8_t>(input.front());
input.remove_prefix(1);
if (b < SELF) {
return {static_cast<char32_t>(b), 1};
}
const auto leading = LEADING[b];
if (leading == XX) {
return {Unicode::REPLACEMENT_CHARACTER, 1};
}
auto size = static_cast<size_t>(leading & 7) - 1;
if (size > input.size()) {
return {Unicode::REPLACEMENT_CHARACTER, 1};
}
const auto& accept = ACCEPT[leading >> 4];
const auto b1 = static_cast<uint8_t>(input.front());
input.remove_prefix(1);
if (b1 < accept.first || b1 > accept.second) {
return {Unicode::REPLACEMENT_CHARACTER, 1};
}
if (size <= 1) {
return {(static_cast<char32_t>(b & MASK2) << 6) |
static_cast<char32_t>(b1 & MASKX),
2};
}
const auto b2 = static_cast<uint8_t>(input.front());
input.remove_prefix(1);
if (b2 < LOW || b2 > HIGH) {
return {Unicode::REPLACEMENT_CHARACTER, 1};
}
if (size <= 2) {
return {(static_cast<char32_t>(b & MASK3) << 12) |
(static_cast<char32_t>(b1 & MASKX) << 6) |
static_cast<char32_t>(b2 & MASKX),
3};
}
const auto b3 = static_cast<uint8_t>(input.front());
input.remove_prefix(1);
if (b3 < LOW || b3 > HIGH) {
return {Unicode::REPLACEMENT_CHARACTER, 1};
}
return {(static_cast<char32_t>(b & MASK4) << 18) |
(static_cast<char32_t>(b1 & MASKX) << 12) |
(static_cast<char32_t>(b2 & MASKX) << 6) |
static_cast<char32_t>(b3 & MASKX),
4};
}
// Decodes UTF-8 `input` into code points. Returns std::nullopt as soon as an
// illegal byte sequence is found.
std::optional<std::u32string> Utf8::strictDecode(std::string_view input) {
  std::u32string decoded;
  decoded.reserve(input.size()); // Worst case is each byte is a single Unicode code point.

  size_t offset = 0;
  while (offset < input.size()) {
    const auto [codePoint, consumed] = Utf8::decode(input.substr(offset));
    // A replacement character that took only one byte is how Utf8::decode
    // signals an illegal sequence (a genuine U+FFFD takes three bytes).
    if (consumed == 1 && codePoint == Unicode::REPLACEMENT_CHARACTER) {
      return std::nullopt;
    }
    decoded.push_back(codePoint);
    offset += consumed;
  }

  decoded.shrink_to_fit();
  return decoded;
}
// Decodes UTF-8 `input` into code points. Bytes that do not form a legal
// sequence are each turned into the replacement character reported by Utf8::decode.
std::u32string Utf8::lenientDecode(std::string_view input) {
  std::u32string decoded;
  decoded.reserve(input.size()); // Worst case is each byte is a single Unicode code point.

  size_t offset = 0;
  while (offset < input.size()) {
    const auto [codePoint, consumed] = Utf8::decode(input.substr(offset));
    decoded.push_back(codePoint);
    offset += consumed;
  }

  decoded.shrink_to_fit();
  return decoded;
}
// Appends the UTF-8 encoding of `codePoint` to `*buffer` (which must not be
// null) and returns a reference to the buffer. A code point rejected by
// Unicode::isValid is encoded as the replacement character instead.
std::string& Utf8::encode(std::string* buffer, char32_t codePoint) {
  assert(buffer != nullptr);
  if (!Unicode::isValid(codePoint)) {
    codePoint = Unicode::REPLACEMENT_CHARACTER;
  }

  // Appends a leading byte: tag bits combined with the given payload bits.
  const auto putLeading = [buffer](uint8_t tag, char32_t bits) {
    buffer->push_back(static_cast<char>(tag | static_cast<uint8_t>(bits)));
  };
  // Appends a continuation byte carrying the low six bits of `bits`.
  const auto putContinuation = [buffer](char32_t bits) {
    buffer->push_back(static_cast<char>(TX | (static_cast<uint8_t>(bits) & MASKX)));
  };

  if (codePoint <= 0x7f) {
    buffer->push_back(static_cast<char>(static_cast<uint8_t>(codePoint)));
  } else if (codePoint <= 0x7ff) {
    putLeading(T2, codePoint >> 6);
    putContinuation(codePoint);
  } else if (codePoint <= 0xffff) {
    putLeading(T3, codePoint >> 12);
    putContinuation(codePoint >> 6);
    putContinuation(codePoint);
  } else {
    putLeading(T4, codePoint >> 18);
    putContinuation(codePoint >> 12);
    putContinuation(codePoint >> 6);
    putContinuation(codePoint);
  }
  return *buffer;
}
// Encodes `input` as UTF-8. Returns std::nullopt if any code point is
// rejected by Unicode::isValid.
std::optional<std::string> Utf8::strictEncode(std::u32string_view input) {
  std::string encoded;
  encoded.reserve(input.size() * 4); // Worst case is each Unicode code point encodes to 4 bytes.

  for (const char32_t codePoint : input) {
    if (Unicode::isValid(codePoint)) {
      Utf8::encode(&encoded, codePoint);
    } else {
      return std::nullopt;
    }
  }

  encoded.shrink_to_fit();
  return encoded;
}
// Encodes `input` as UTF-8, substituting the replacement character for every
// code point rejected by Unicode::isValid.
std::string Utf8::lenientEncode(std::u32string_view input) {
  std::string encoded;
  encoded.reserve(input.size() * 4); // Worst case is each Unicode code point encodes to 4 bytes.

  for (char32_t codePoint : input) {
    const bool valid = Unicode::isValid(codePoint);
    if (!valid) {
      codePoint = Unicode::REPLACEMENT_CHARACTER;
    }
    Utf8::encode(&encoded, codePoint);
  }

  encoded.shrink_to_fit();
  return encoded;
}
}

View File

@@ -0,0 +1,54 @@
/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include "antlr4-common.h"
namespace antlrcpp {
// Static UTF-8 encoding and decoding helpers. The class cannot be instantiated.
class ANTLR4CPP_PUBLIC Utf8 final {
public:
// Decodes the next code point, returning the decoded code point and the number
// of code units (a.k.a. bytes) consumed. `input` must not be empty. In the event
// that an invalid code unit sequence is encountered, the replacement character,
// U+FFFD, is returned with a code unit count of 1. As U+FFFD requires 3 code
// units when encoded, this can be used to differentiate valid input from
// malformed input.
static std::pair<char32_t, size_t> decode(std::string_view input);
// Decodes the given UTF-8 encoded input into a string of code points.
// Returns std::nullopt if the input contains an illegal byte sequence.
static std::optional<std::u32string> strictDecode(std::string_view input);
// Decodes the given UTF-8 encoded input into a string of code points. Unlike strictDecode(),
// each byte in an illegal byte sequence is replaced with the Unicode replacement character,
// U+FFFD.
static std::u32string lenientDecode(std::string_view input);
// Encodes the given code point and appends it to the buffer, which must not be
// null. If the code point is an unpaired surrogate or outside of the valid
// Unicode range it is replaced with the replacement character, U+FFFD.
// Returns a reference to the buffer.
static std::string& encode(std::string *buffer, char32_t codePoint);
// Encodes the given Unicode code point string as UTF-8.
// Returns std::nullopt if the input contains an invalid Unicode code point.
static std::optional<std::string> strictEncode(std::u32string_view input);
// Encodes the given Unicode code point string as UTF-8. Unlike strictEncode(),
// each invalid Unicode code point is replaced with the Unicode replacement character, U+FFFD.
static std::string lenientEncode(std::u32string_view input);
private:
// Static members only: construction, copying and moving are all disabled.
Utf8() = delete;
Utf8(const Utf8&) = delete;
Utf8(Utf8&&) = delete;
Utf8& operator=(const Utf8&) = delete;
Utf8& operator=(Utf8&&) = delete;
};
}