676 lines
21 KiB
C++
676 lines
21 KiB
C++
/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
|
|
|
|
namespace juce
|
|
{
|
|
|
|
//==============================================================================
/** Class containing some basic functions for simple tokenising of C++ code.

    @tags{GUI}
*/
|
|
struct CppTokeniserFunctions
|
|
{
|
|
static bool isIdentifierStart (const juce_wchar c) noexcept
|
|
{
|
|
return CharacterFunctions::isLetter (c)
|
|
|| c == '_' || c == '@';
|
|
}
|
|
|
|
static bool isIdentifierBody (const juce_wchar c) noexcept
|
|
{
|
|
return CharacterFunctions::isLetterOrDigit (c)
|
|
|| c == '_' || c == '@';
|
|
}
|
|
|
|
/** Checks whether a token is one of the keywords known to this tokeniser.

    The keyword tables are bucketed by length: tokens of 2 to 7 characters are looked
    up in the table for exactly that length, and tokens of 8 to 16 characters are looked
    up in keywordsOther. Every keyword must therefore live in the table that matches its
    own length, otherwise it can never be found.

    @param token        the candidate token; it is compared in full against each table entry
    @param tokenLength  the number of characters in the token, used to choose the table
    @returns            true if the token matches an entry in the selected table
*/
static bool isReservedKeyword (String::CharPointerType token, const int tokenLength) noexcept
{
    static const char* const keywords2Char[] =
        { "do", "if", "or", nullptr };

    static const char* const keywords3Char[] =
        { "and", "asm", "for", "int", "new", "not", "try", "xor", nullptr };

    // "@end" has to be here (not in keywordsOther) because it is 4 characters long.
    static const char* const keywords4Char[] =
        { "@end", "auto", "bool", "case", "char", "else", "enum", "goto",
          "long", "this", "true", "void", nullptr };

    static const char* const keywords5Char[] =
        { "bitor", "break", "catch", "class", "compl", "const", "false", "final",
          "float", "or_eq", "short", "throw", "union", "using", "while", nullptr };

    // "@class" has to be here (not in keywordsOther) because it is 6 characters long.
    static const char* const keywords6Char[] =
        { "@class", "and_eq", "bitand", "delete", "double", "export", "extern", "friend",
          "import", "inline", "module", "not_eq", "public", "return", "signed",
          "sizeof", "static", "struct", "switch", "typeid", "xor_eq", nullptr };

    // "@public" has to be here (not in keywordsOther) because it is 7 characters long.
    static const char* const keywords7Char[] =
        { "@public", "__cdecl", "_Pragma", "alignas", "alignof", "concept", "default",
          "mutable", "nullptr", "private", "typedef", "uint8_t", "virtual",
          "wchar_t", nullptr };

    // Only searched for tokens of 8 to 16 characters, so every entry must be in that range.
    static const char* const keywordsOther[] =
        { "@dynamic", "@implementation", "@interface",
          "@private", "@protected", "@property", "@synthesize", "__fastcall", "__stdcall",
          "atomic_cancel", "atomic_commit", "atomic_noexcept", "char16_t", "char32_t",
          "co_await", "co_return", "co_yield", "const_cast", "constexpr", "continue",
          "decltype", "dynamic_cast", "explicit", "namespace", "noexcept", "operator", "override",
          "protected", "register", "reinterpret_cast", "requires", "static_assert",
          "static_cast", "synchronized", "template", "thread_local", "typename", "unsigned",
          "volatile", nullptr };

    const char* const* k;

    switch (tokenLength)
    {
        case 2:     k = keywords2Char; break;
        case 3:     k = keywords3Char; break;
        case 4:     k = keywords4Char; break;
        case 5:     k = keywords5Char; break;
        case 6:     k = keywords6Char; break;
        case 7:     k = keywords7Char; break;

        default:
            // No keyword is shorter than 2 or longer than 16 characters.
            if (tokenLength < 2 || tokenLength > 16)
                return false;

            k = keywordsOther;
            break;
    }

    // Each table is terminated by a nullptr sentinel.
    for (int i = 0; k[i] != nullptr; ++i)
        if (token.compare (CharPointer_ASCII (k[i])) == 0)
            return true;

    return false;
}
|
|
|
|
template <typename Iterator>
|
|
static int parseIdentifier (Iterator& source) noexcept
|
|
{
|
|
int tokenLength = 0;
|
|
String::CharPointerType::CharType possibleIdentifier[100];
|
|
String::CharPointerType possible (possibleIdentifier);
|
|
|
|
while (isIdentifierBody (source.peekNextChar()))
|
|
{
|
|
auto c = source.nextChar();
|
|
|
|
if (tokenLength < 20)
|
|
possible.write (c);
|
|
|
|
++tokenLength;
|
|
}
|
|
|
|
if (tokenLength > 1 && tokenLength <= 16)
|
|
{
|
|
possible.writeNull();
|
|
|
|
if (isReservedKeyword (String::CharPointerType (possibleIdentifier), tokenLength))
|
|
return CPlusPlusCodeTokeniser::tokenType_keyword;
|
|
}
|
|
|
|
return CPlusPlusCodeTokeniser::tokenType_identifier;
|
|
}
|
|
|
|
template <typename Iterator>
|
|
static bool skipNumberSuffix (Iterator& source)
|
|
{
|
|
auto c = source.peekNextChar();
|
|
|
|
if (c == 'l' || c == 'L' || c == 'u' || c == 'U')
|
|
source.skip();
|
|
|
|
if (CharacterFunctions::isLetterOrDigit (source.peekNextChar()))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
/** Returns true if the character is one of 0-9, a-f or A-F. */
static bool isHexDigit (const juce_wchar c) noexcept
{
    if (c >= '0' && c <= '9')
        return true;

    if (c >= 'a' && c <= 'f')
        return true;

    return c >= 'A' && c <= 'F';
}
|
|
|
|
/** Attempts to consume a hexadecimal integer literal such as 0x1F or -0Xff.

    The literal must consist of an optional '-', a "0x" or "0X" prefix, at least one
    hex digit, and whatever suffix skipNumberSuffix() accepts.

    @returns true if a complete literal was consumed. On failure the source may have
             been partially advanced, so callers need to restore it themselves.
*/
template <typename Iterator>
static bool parseHexLiteral (Iterator& source) noexcept
{
    if (source.peekNextChar() == '-')
        source.skip();

    if (source.nextChar() != '0')
        return false;

    const auto prefix = source.nextChar();

    if (! (prefix == 'x' || prefix == 'X'))
        return false;

    bool foundDigit = false;

    for (; isHexDigit (source.peekNextChar()); source.skip())
        foundDigit = true;

    return foundDigit && skipNumberSuffix (source);
}
|
|
|
|
/** Returns true if the character is one of the digits 0 to 7. */
static bool isOctalDigit (const juce_wchar c) noexcept
{
    return ! (c < '0' || c > '7');
}
|
|
|
|
/** Attempts to consume an octal integer literal such as 0755.

    The literal must consist of an optional '-', a leading '0', at least one further
    octal digit, and whatever suffix skipNumberSuffix() accepts.

    @returns true if a complete literal was consumed. On failure the source may have
             been partially advanced, so callers need to restore it themselves.
*/
template <typename Iterator>
static bool parseOctalLiteral (Iterator& source) noexcept
{
    if (source.peekNextChar() == '-')
        source.skip();

    // The second character is only read if the first one was the leading zero.
    if (source.nextChar() != '0' || ! isOctalDigit (source.nextChar()))
        return false;

    for (auto c = source.peekNextChar(); isOctalDigit (c); c = source.peekNextChar())
        source.skip();

    return skipNumberSuffix (source);
}
|
|
|
|
/** Returns true if the character is one of the digits 0 to 9. */
static bool isDecimalDigit (const juce_wchar c) noexcept
{
    return ! (c < '0' || c > '9');
}
|
|
|
|
/** Attempts to consume a decimal integer literal such as 42 or -7.

    The literal must consist of an optional '-', at least one decimal digit, and
    whatever suffix skipNumberSuffix() accepts.

    @returns true if a complete literal was consumed. On failure the source may have
             been partially advanced, so callers need to restore it themselves.
*/
template <typename Iterator>
static bool parseDecimalLiteral (Iterator& source) noexcept
{
    if (source.peekNextChar() == '-')
        source.skip();

    bool foundDigit = false;

    for (; isDecimalDigit (source.peekNextChar()); source.skip())
        foundDigit = true;

    return foundDigit && skipNumberSuffix (source);
}
|
|
|
|
/** Attempts to consume a floating-point literal such as 1.5, .25f, 3e-4 or 2f.

    The accepted form is: an optional '-', digits with an optional decimal point (at
    least one digit in total), an optional exponent ('e' or 'E', optional sign, at least
    one digit), and an optional 'f' or 'F' suffix. Text with no point, no exponent and
    no suffix is rejected so that it can be treated as an integer instead.

    @returns true if a complete literal was consumed. On failure the source may have
             been partially advanced, so callers need to restore it themselves.
*/
template <typename Iterator>
static bool parseFloatLiteral (Iterator& source) noexcept
{
    // Consumes a run of decimal digits and reports how many were skipped.
    auto skipDigits = [&source]() -> int
    {
        int count = 0;

        for (; isDecimalDigit (source.peekNextChar()); ++count)
            source.skip();

        return count;
    };

    if (source.peekNextChar() == '-')
        source.skip();

    int mantissaDigits = skipDigits();

    const bool hasPoint = (source.peekNextChar() == '.');

    if (hasPoint)
    {
        source.skip();
        mantissaDigits += skipDigits();
    }

    // A lone "." (or nothing at all) isn't a number.
    if (mantissaDigits == 0)
        return false;

    const auto exponentMarker = source.peekNextChar();
    const bool hasExponent = (exponentMarker == 'e' || exponentMarker == 'E');

    if (hasExponent)
    {
        source.skip();

        const auto sign = source.peekNextChar();

        if (sign == '+' || sign == '-')
            source.skip();

        // An exponent marker must be followed by at least one digit.
        if (skipDigits() == 0)
            return false;
    }

    const auto suffix = source.peekNextChar();

    if (suffix == 'f' || suffix == 'F')
    {
        source.skip();
        return true;
    }

    // Without a suffix, only a point or an exponent makes this a float.
    return hasExponent || hasPoint;
}
|
|
|
|
template <typename Iterator>
|
|
static int parseNumber (Iterator& source)
|
|
{
|
|
const Iterator original (source);
|
|
|
|
if (parseFloatLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_float;
|
|
source = original;
|
|
|
|
if (parseHexLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer;
|
|
source = original;
|
|
|
|
if (parseOctalLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer;
|
|
source = original;
|
|
|
|
if (parseDecimalLiteral (source)) return CPlusPlusCodeTokeniser::tokenType_integer;
|
|
source = original;
|
|
|
|
return CPlusPlusCodeTokeniser::tokenType_error;
|
|
}
|
|
|
|
/** Skips over a quoted string or character literal.

    The first character read is taken to be the opening quote; everything up to and
    including the next matching quote character is consumed. A backslash causes the
    character after it to be skipped, so escaped quotes do not end the literal.
    Scanning also stops if a zero character is read.
*/
template <typename Iterator>
static void skipQuotedString (Iterator& source) noexcept
{
    const auto delimiter = source.nextChar();

    auto c = source.nextChar();

    while (c != delimiter && c != 0)
    {
        // Step over whatever follows a backslash without inspecting it.
        if (c == '\\')
            source.skip();

        c = source.nextChar();
    }
}
|
|
|
|
/** Skips the body of a block comment.

    Characters are consumed up to and including the first "*" followed immediately
    by "/", or until a zero character is read. The caller is expected to have
    consumed the opening of the comment already.
*/
template <typename Iterator>
static void skipComment (Iterator& source) noexcept
{
    bool previousWasStar = false;

    for (auto c = source.nextChar(); c != 0; c = source.nextChar())
    {
        if (previousWasStar && c == '/')
            return;

        previousWasStar = (c == '*');
    }
}
|
|
|
|
/** Skips over a preprocessor directive.

    Scanning stops without consuming anything further when a comment opener
    ("//" or "/*") is found, or when a zero character is reached. Double-quoted strings
    inside the directive are skipped as a unit. When a line ending is reached,
    skipToEndOfLine() is called on the source, and if the last character before the line
    ending was a backslash, scanning resumes recursively to handle the continuation.
*/
template <typename Iterator>
static void skipPreprocessorLine (Iterator& source) noexcept
{
    bool continuationPending = false;

    while (true)
    {
        const auto c = source.peekNextChar();

        if (c == '"')
        {
            skipQuotedString (source);
            continue;
        }

        if (c == '/')
        {
            // Look one character ahead on a copy, so that the source itself isn't moved.
            Iterator lookahead (source);
            lookahead.skip();
            const auto following = lookahead.peekNextChar();

            if (following == '/' || following == '*')
                return;
        }

        if (c == 0)
            return;

        if (c == '\n' || c == '\r')
        {
            source.skipToEndOfLine();

            if (continuationPending)
                skipPreprocessorLine (source);

            return;
        }

        continuationPending = (c == '\\');
        source.skip();
    }
}
|
|
|
|
template <typename Iterator>
|
|
static void skipIfNextCharMatches (Iterator& source, const juce_wchar c) noexcept
|
|
{
|
|
if (source.peekNextChar() == c)
|
|
source.skip();
|
|
}
|
|
|
|
template <typename Iterator>
|
|
static void skipIfNextCharMatches (Iterator& source, const juce_wchar c1, const juce_wchar c2) noexcept
|
|
{
|
|
auto c = source.peekNextChar();
|
|
|
|
if (c == c1 || c == c2)
|
|
source.skip();
|
|
}
|
|
|
|
template <typename Iterator>
|
|
static int readNextToken (Iterator& source)
|
|
{
|
|
source.skipWhitespace();
|
|
auto firstChar = source.peekNextChar();
|
|
|
|
switch (firstChar)
|
|
{
|
|
case 0:
|
|
break;
|
|
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
case '.':
|
|
{
|
|
auto result = parseNumber (source);
|
|
|
|
if (result == CPlusPlusCodeTokeniser::tokenType_error)
|
|
{
|
|
source.skip();
|
|
|
|
if (firstChar == '.')
|
|
return CPlusPlusCodeTokeniser::tokenType_punctuation;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
case ',':
|
|
case ';':
|
|
case ':':
|
|
source.skip();
|
|
return CPlusPlusCodeTokeniser::tokenType_punctuation;
|
|
|
|
case '(': case ')':
|
|
case '{': case '}':
|
|
case '[': case ']':
|
|
source.skip();
|
|
return CPlusPlusCodeTokeniser::tokenType_bracket;
|
|
|
|
case '"':
|
|
case '\'':
|
|
skipQuotedString (source);
|
|
return CPlusPlusCodeTokeniser::tokenType_string;
|
|
|
|
case '+':
|
|
source.skip();
|
|
skipIfNextCharMatches (source, '+', '=');
|
|
return CPlusPlusCodeTokeniser::tokenType_operator;
|
|
|
|
case '-':
|
|
{
|
|
source.skip();
|
|
auto result = parseNumber (source);
|
|
|
|
if (result == CPlusPlusCodeTokeniser::tokenType_error)
|
|
{
|
|
skipIfNextCharMatches (source, '-', '=');
|
|
return CPlusPlusCodeTokeniser::tokenType_operator;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
case '*': case '%':
|
|
case '=': case '!':
|
|
source.skip();
|
|
skipIfNextCharMatches (source, '=');
|
|
return CPlusPlusCodeTokeniser::tokenType_operator;
|
|
|
|
case '/':
|
|
{
|
|
source.skip();
|
|
auto nextChar = source.peekNextChar();
|
|
|
|
if (nextChar == '/')
|
|
{
|
|
source.skipToEndOfLine();
|
|
return CPlusPlusCodeTokeniser::tokenType_comment;
|
|
}
|
|
|
|
if (nextChar == '*')
|
|
{
|
|
source.skip();
|
|
skipComment (source);
|
|
return CPlusPlusCodeTokeniser::tokenType_comment;
|
|
}
|
|
|
|
if (nextChar == '=')
|
|
source.skip();
|
|
|
|
return CPlusPlusCodeTokeniser::tokenType_operator;
|
|
}
|
|
|
|
case '?':
|
|
case '~':
|
|
source.skip();
|
|
return CPlusPlusCodeTokeniser::tokenType_operator;
|
|
|
|
case '<': case '>':
|
|
case '|': case '&': case '^':
|
|
source.skip();
|
|
skipIfNextCharMatches (source, firstChar);
|
|
skipIfNextCharMatches (source, '=');
|
|
return CPlusPlusCodeTokeniser::tokenType_operator;
|
|
|
|
case '#':
|
|
skipPreprocessorLine (source);
|
|
return CPlusPlusCodeTokeniser::tokenType_preprocessor;
|
|
|
|
default:
|
|
if (isIdentifierStart (firstChar))
|
|
return parseIdentifier (source);
|
|
|
|
source.skip();
|
|
break;
|
|
}
|
|
|
|
return CPlusPlusCodeTokeniser::tokenType_error;
|
|
}
|
|
|
|
/** A class that can be passed to the CppTokeniserFunctions functions in order to
|
|
parse a String.
|
|
*/
|
|
struct StringIterator
|
|
{
|
|
StringIterator (const String& s) noexcept : t (s.getCharPointer()) {}
|
|
StringIterator (String::CharPointerType s) noexcept : t (s) {}
|
|
|
|
juce_wchar nextChar() noexcept { if (isEOF()) return 0; ++numChars; return t.getAndAdvance(); }
|
|
juce_wchar peekNextChar()noexcept { return *t; }
|
|
void skip() noexcept { if (! isEOF()) { ++t; ++numChars; } }
|
|
void skipWhitespace() noexcept { while (t.isWhitespace()) skip(); }
|
|
void skipToEndOfLine() noexcept { while (*t != '\r' && *t != '\n' && *t != 0) skip(); }
|
|
bool isEOF() const noexcept { return t.isEmpty(); }
|
|
|
|
String::CharPointerType t;
|
|
int numChars = 0;
|
|
};
|
|
|
|
//==============================================================================
|
|
/** Takes a UTF8 string and writes it to a stream using standard C++ escape sequences for any
|
|
non-ascii bytes.
|
|
|
|
Although not strictly a tokenising function, this is still a function that often comes in
|
|
handy when working with C++ code!
|
|
|
|
Note that addEscapeChars() is easier to use than this function if you're working with Strings.
|
|
|
|
@see addEscapeChars
|
|
*/
|
|
static void writeEscapeChars (OutputStream& out, const char* utf8, const int numBytesToRead,
|
|
const int maxCharsOnLine, const bool breakAtNewLines,
|
|
const bool replaceSingleQuotes, const bool allowStringBreaks)
|
|
{
|
|
int charsOnLine = 0;
|
|
bool lastWasHexEscapeCode = false;
|
|
bool trigraphDetected = false;
|
|
|
|
for (int i = 0; i < numBytesToRead || numBytesToRead < 0; ++i)
|
|
{
|
|
auto c = (unsigned char) utf8[i];
|
|
bool startNewLine = false;
|
|
|
|
switch (c)
|
|
{
|
|
|
|
case '\t': out << "\\t"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break;
|
|
case '\r': out << "\\r"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break;
|
|
case '\n': out << "\\n"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; startNewLine = breakAtNewLines; break;
|
|
case '\\': out << "\\\\"; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break;
|
|
case '\"': out << "\\\""; trigraphDetected = false; lastWasHexEscapeCode = false; charsOnLine += 2; break;
|
|
|
|
case '?':
|
|
if (trigraphDetected)
|
|
{
|
|
out << "\\?";
|
|
charsOnLine++;
|
|
trigraphDetected = false;
|
|
}
|
|
else
|
|
{
|
|
out << "?";
|
|
trigraphDetected = true;
|
|
}
|
|
|
|
lastWasHexEscapeCode = false;
|
|
charsOnLine++;
|
|
break;
|
|
|
|
case 0:
|
|
if (numBytesToRead < 0)
|
|
return;
|
|
|
|
out << "\\0";
|
|
lastWasHexEscapeCode = true;
|
|
trigraphDetected = false;
|
|
charsOnLine += 2;
|
|
break;
|
|
|
|
case '\'':
|
|
if (replaceSingleQuotes)
|
|
{
|
|
out << "\\\'";
|
|
lastWasHexEscapeCode = false;
|
|
trigraphDetected = false;
|
|
charsOnLine += 2;
|
|
break;
|
|
}
|
|
|
|
// deliberate fall-through...
|
|
|
|
default:
|
|
if (c >= 32 && c < 127 && ! (lastWasHexEscapeCode // (have to avoid following a hex escape sequence with a valid hex digit)
|
|
&& CharacterFunctions::getHexDigitValue (c) >= 0))
|
|
{
|
|
out << (char) c;
|
|
lastWasHexEscapeCode = false;
|
|
trigraphDetected = false;
|
|
++charsOnLine;
|
|
}
|
|
else if (allowStringBreaks && lastWasHexEscapeCode && c >= 32 && c < 127)
|
|
{
|
|
out << "\"\"" << (char) c;
|
|
lastWasHexEscapeCode = false;
|
|
trigraphDetected = false;
|
|
charsOnLine += 3;
|
|
}
|
|
else
|
|
{
|
|
out << (c < 16 ? "\\x0" : "\\x") << String::toHexString ((int) c);
|
|
lastWasHexEscapeCode = true;
|
|
trigraphDetected = false;
|
|
charsOnLine += 4;
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
if ((startNewLine || (maxCharsOnLine > 0 && charsOnLine >= maxCharsOnLine))
|
|
&& (numBytesToRead < 0 || i < numBytesToRead - 1))
|
|
{
|
|
charsOnLine = 0;
|
|
out << "\"" << newLine << "\"";
|
|
lastWasHexEscapeCode = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Takes a string and returns a version of it where standard C++ escape sequences have been
|
|
used to replace any non-ascii bytes.
|
|
|
|
Although not strictly a tokenising function, this is still a function that often comes in
|
|
handy when working with C++ code!
|
|
|
|
@see writeEscapeChars
|
|
*/
|
|
static String addEscapeChars (const String& s)
|
|
{
|
|
MemoryOutputStream mo;
|
|
writeEscapeChars (mo, s.toRawUTF8(), -1, -1, false, true, true);
|
|
return mo.toString();
|
|
}
|
|
};
|
|
|
|
} // namespace juce
|