diff --git a/CMakeLists.txt b/CMakeLists.txt index 3de8890..3cc5dfe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,4 @@ include_directories(./include) aux_source_directory(src SRC_FILES) -add_executable(LR1Generator LR1Generator.cpp ${SRC_FILES}) - -add_executable(syntaxParser LR1Generator.cpp ${SRC_FILES}) \ No newline at end of file +add_executable(LR1Compiler Compiler.cpp ${SRC_FILES}) \ No newline at end of file diff --git a/LR1Generator.cpp b/Compiler.cpp similarity index 52% rename from LR1Generator.cpp rename to Compiler.cpp index 603eb80..451c341 100644 --- a/LR1Generator.cpp +++ b/Compiler.cpp @@ -1,11 +1,15 @@ #include #include +#include + #include #include - #include +#include +#include + using std::vector; using std::wstring; @@ -13,14 +17,33 @@ using std::wcout; using std::endl; -#include -#include +int main(int argc, const char* argv[]) { - -int main() { try { - clock_t start, end;//定义clock_t变量 - start = clock(); //开始时间 + + printf("Compile Program Based on LR(1) Written By Saturneric\n"); + + if (argc < 2) { + printf("Usage: \n"); + return -1; + } + + clock_t start, end; + start = clock(); + + Automata atm(argv[1]); + + atm.parse(); + + atm.output(); + + // 输出时间 + end = clock(); + double times = double(end - start) / CLOCKS_PER_SEC; + wcout << "Token Automata Run time = " << times << "s MicroSeconds" << " = " << times * 1000 << "ms" << endl; + + // LR1生成 + start = clock(); const GrammarResourcePool *pool; @@ -37,9 +60,11 @@ int main() { //输出时间 end = clock(); //结束时间 - double times = double(end - start) / CLOCKS_PER_SEC; - wcout << "LR1Generator Run time = " << times << "s MicroSeconds" << " = " << times * 1000 << "ms" << endl; + times = double(end - start) / CLOCKS_PER_SEC; + wcout << "LR(1) Generator Run time = " << times << "s MicroSeconds" << " = " << times * 1000 << "ms" << endl; + + // 语法分析 start = clock(); //开始时间 SyntaxParser syntaxParser(pool, atg); @@ -51,7 +76,8 @@ int main() { //输出时间 end = clock(); //结束时间 times = double(end - start) / CLOCKS_PER_SEC; - wcout << "SyntaxParser Run time = " << times << "s MicroSeconds " << " = " << times * 1000 << "ms" << endl; + wcout << "Syntax Parser Run time = " << times << "s MicroSeconds " << " = " << times * 1000 << "ms" << endl; + } catch(std::runtime_error &e) { std::wcout << "Runtime Error: " << e.what() << endl; } diff --git a/SyntaxParser.cpp b/SyntaxParser.cpp deleted file mode 100644 index 706f558..0000000 --- a/SyntaxParser.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// -// Created by Administrator on 2021/4/30. -// - -int main() { - -} \ No newline at end of file diff --git a/cmake-build-debug/tokenOut.txt b/cmake-build-debug/tokenOut.txt index a2e1df2..3507f46 100644 --- a/cmake-build-debug/tokenOut.txt +++ b/cmake-build-debug/tokenOut.txt @@ -2,7 +2,7 @@ 2 FLOAT(float) ID(a1) SEMICOLON(;) 3 DOUBLE(double) ID(a2) COMMA(,) ID(a3) COMMA(,) ID(a4) SEMICOLON(;) 4 LONG(long) DOUBLE(double) ID(a5) SEMICOLON(;) -5 STRUCT(struct) ID(warp_int) OPENING_BRACE({) +5 STRUCT(struct) ID(warp_int) OPENING_BRACE({) 6 INT8(int8) ID(i1) SEMICOLON(;) 7 INT16(int16) ID(i2) SEMICOLON(;) 8 INT32(int32) ID(i3) COMMA(,) ID(i4) SEMICOLON(;) @@ -21,13 +21,13 @@ 21 UNSIGNED(unsigned) LONG(long) LONG(long) ID(s9) SEMICOLON(;) 22 CLOSING_BRACE(}) ID(sign1) COMMA(,) ID(SIGN2) SEMICOLON(;) 23 CHAR(char) ID(a_6) LEFT_BRACKET([) STRING("compile") PLUS(+) STRING("studying") COMMA(,) STRING("\40") COMMA(,) STRING("abs\b\t\n\f\r\"\\abs") RIGHT_BRACKET(]) SEMICOLON(;) -24 BOOLEAN(boolean) ID(a_bool_7) LEFT_BRACKET([) INTEGER(10) INSERT(^) INTEGER(2) COMMA(,) INTEGER(1) AND(&) INTEGER(2) AND(&) INTEGER(3) COMMA(,) TRUE(TRUE) DELIMITER(|) FALSE(FALSE) COMMA(,) TILDE(~) FALSE(FALSE) DELIMITER(|) TILDE(~) TRUE(TRUE) RIGHT_BRACKET(]) SEMICOLON(;) +24 BOOLEAN(boolean) ID(a_bool_7) LEFT_BRACKET([) INTEGER(10) INSERT(^) INTEGER(2) COMMA(,) INTEGER(1) AND(&) INTEGER(2) AND(&) INTEGER(3) COMMA(,) TRUE(TRUE) DELIMITER(|) FALSE(FALSE) COMMA(,) TILDE(~) FALSE(FALSE) DELIMITER(|) TILDE(~) TRUE(TRUE) RIGHT_BRACKET(]) SEMICOLON(;) 25 LONG(long) ID(a8) LEFT_BRACKET([) INTEGER(1024) RIGHT_SHIFT(>>) INTEGER(10) COMMA(,) INTEGER(0) LEFT_SHIFT(<<) INTEGER(10) COMMA(,) INTEGER(100) MULT(*) INTEGER(2) SLASH(/) INTEGER(10) PERCENT(%) INTEGER(2) COMMA(,) INTEGER(100) PLUS(+) INTEGER(21) SUB(-) INTEGER(19) RIGHT_BRACKET(]) SEMICOLON(;) 26 BOOLEAN(boolean) ID(a9) LEFT_BRACKET([) INTEGER(10) INSERT(^) INTEGER(2) AND(&) INTEGER(3) DELIMITER(|) SUB(-) INTEGER(1) RIGHT_SHIFT(>>) INTEGER(10) AND(&) INTEGER(100) LEFT_SHIFT(<<) SUB(-) INTEGER(10) SUB(-) INTEGER(10) PLUS(+) INTEGER(100) MULT(*) INTEGER(2) SLASH(/) INTEGER(10) PERCENT(%) INTEGER(2) RIGHT_BRACKET(]) SEMICOLON(;) 27 STRUCT(struct) ID(warp_1) OPENING_BRACE({) 28 FLOAT(float) ID(w1) SEMICOLON(;) 29 LONG(long) ID(w2) SEMICOLON(;) -30 STRUCT(struct) ID(warp_2) OPENING_BRACE({) +30 STRUCT(struct) ID(warp_2) OPENING_BRACE({) 31 BOOLEAN(boolean) ID(w3) LEFT_BRACKET([) INTEGER(111) AND(&) INTEGER(2) RIGHT_BRACKET(]) SEMICOLON(;) 32 CHAR(char) ID(w4) LEFT_BRACKET([) STRING("\40\b\t\n\f\r\"\\\40") RIGHT_BRACKET(]) SEMICOLON(;) 33 CLOSING_BRACE(}) ID(w5) COMMA(,) ID(w6) SEMICOLON(;) diff --git a/cmake-build-release/syntaxInput.txt b/cmake-build-release/syntaxInput.txt new file mode 100644 index 0000000..9a8810d --- /dev/null +++ b/cmake-build-release/syntaxInput.txt @@ -0,0 +1,89 @@ +@struct_type -> "STRUCT" "ID" "OPENING_BRACE" member_list "CLOSING_BRACE" more_struct_type +more_struct_type -> "EOF" +more_struct_type -> 蔚 + +member_list -> type_spec declarators "SEMICOLON" member_list +member_list -> 蔚 + +type_spec -> base_type_spec +type_spec -> @struct_type +base_type_spec -> floating_pt_type +base_type_spec -> integer_type +base_type_spec -> "CHAR" +base_type_spec -> "BOOLEAN" +floating_pt_type -> "FLOAT" +floating_pt_type -> "DOUBLE" +floating_pt_type -> "LONG" "DOUBLE" +integer_type -> signed_int +integer_type -> unsigned_int + +signed_int -> "SHORT" +signed_int -> "INT16" +signed_int -> "LONG" +signed_int -> "INT32" +signed_int -> "LONG" "LONG" +signed_int -> "INT64" +signed_int -> "INT8" + +unsigned_int -> "UNSIGNED" "SHORT" +unsigned_int -> "UNSIGNED" "LONG" +unsigned_int -> "UNSIGNED" "LONG" "LONG" +unsigned_int -> "UINT16" +unsigned_int -> "UINT32" +unsigned_int -> "UINT64" +unsigned_int -> "UINT8" + +declarators -> declarator more_declarators +more_declarators -> "COMMA" declarator more_declarators +more_declarators -> 蔚 + +declarator -> "ID" more_declarator +more_declarator -> exp_list +more_declarator -> 蔚 + +exp_list -> "LEFT_BRACKET" or_expr more_or_expr "RIGHT_BRACKET" +more_or_expr -> "COMMA" or_expr more_or_expr +more_or_expr -> 蔚 + +or_expr -> xor_expr more_xor_expr +more_xor_expr -> "DELIMITER" xor_expr more_xor_expr +more_xor_expr -> 蔚 + +xor_expr -> and_expr more_and_expr +more_and_expr -> "INSERT" and_expr more_and_expr +more_and_expr -> 蔚 + +and_expr -> shift_expr more_shift_expr +more_shift_expr -> "AND" shift_expr more_shift_expr +more_shift_expr -> 蔚 + +shift_expr -> add_expr more_add_expr +more_add_expr -> shift_sign add_expr more_add_expr +shift_sign -> "RIGHT_SHIFT" +shift_sign -> "LEFT_SHIFT" +more_add_expr -> 蔚 + +add_expr -> multi_expr more_multi_expr +more_multi_expr -> multi_sign multi_expr more_multi_expr +multi_sign -> "PLUS" +multi_sign -> "SUB" +more_multi_expr -> 蔚 + +multi_expr -> unary_expr more_unary_expr +more_unary_expr -> unary_sign unary_expr more_unary_expr +unary_sign -> "MULT" +unary_sign -> "SLASH" +unary_sign -> "PERCENT" +more_unary_expr -> 蔚 + +unary_expr -> unary_sign_2 unary_declare +unary_sign_2 -> "SUB" +unary_sign_2 -> "PLUS" +unary_sign_2 -> "TILDE" +unary_sign_2 -> 蔚 +unary_declare -> "INTEGER" +unary_declare -> "STRING" +unary_declare -> BOOLEAN_VALUE + +BOOLEAN_VALUE -> "TRUE" +BOOLEAN_VALUE -> "FALSE" \ No newline at end of file diff --git a/include/Automata.h b/include/Automata.h new file mode 100644 index 0000000..8bec142 --- /dev/null +++ b/include/Automata.h @@ -0,0 +1,118 @@ +// +// Created by Administrator on 2021/5/1. +// + +#ifndef SYNTAXPARSER_AUTOMATA_H +#define SYNTAXPARSER_AUTOMATA_H + +#include +#include +#include +#include +#include +#include +#include + +class Automata { +public: + + std::wifstream input; + + explicit Automata(const std::string& path) : input(path, std::ios::binary) { + auto* codeCvtToUTF8= new std::codecvt_utf8; + input.imbue(std::locale(input.getloc(), codeCvtToUTF8)); + } + + ~Automata() { + input.close(); + } + + void parse() { + while (!ifeof) { + tokens.push_back(nextToken()); + } + } + + void output(); + +private: + + using TokenType = enum { + /* Reserve Words */ + STRUCT, BOOLEAN, SHORT, LONG, + DOUBLE, FLOAT, + INT8, INT16, INT32, INT64, + UINT8, UINT16, UINT32, UINT64, + CHAR, + UNSIGNED, + /* Special Symbols */ + OPENING_BRACE, CLOSING_BRACE, + SEMICOLON, + LEFT_BRACKET, RIGHT_BRACKET, + MULT, PLUS, SUB, TILDE, SLASH, + PERCENT, LEFT_SHIFT, RIGHT_SHIFT, AND, INSERT, + DELIMITER, COMMA, + /* Multicharacter Tokens */ + ID, LETTER, DIGIT, UNDERLINE, T_TRUE, T_FALSE, + INTEGER, INTEGER_TYPE_SUFFIX, STRING, T_BOOLEAN, + /* None & Error & EOF */ + NONE, ERROR, T_EOF + }; + + + using StateType = enum { + START, + S_LETTER, S_UNDERLINE, S_DIGIT, + INT_0, INT_NOT_0, INT_TYPE_SUFFIX, + STRING_START, STRING_END, + S_SIGN, + DONE, + S_NONE + }; + + using ReservedWord = struct { + std::wstring str; + TokenType token; + }; + + struct TokenInfo { + + const int line; + const TokenType token; + const std::wstring str; + const StateType state; + + TokenInfo(const int line, const TokenType token, std::wstring str, StateType state) + : line(line), token(token), str(std::move(str)), state(state) {} + + }; + + FILE* fp = nullptr; + + bool ifeof = false; + + std::vector tokens; + + int line = 1; + + + const static std::vector reservedWords; + + const static std::map tokenTypeStrMap; + + const static std::map stateTypeStrMap; + + wchar_t Automata::nextChar(); + + void pushBackChar(); + + TokenInfo nextToken(); + + static TokenType reservedLookup(const std::wstring& s); + + + +}; + + +#endif //SYNTAXPARSER_AUTOMATA_H diff --git a/src/Automata.cpp b/src/Automata.cpp new file mode 100644 index 0000000..80e0315 --- /dev/null +++ b/src/Automata.cpp @@ -0,0 +1,386 @@ +// +// Created by Administrator on 2021/5/1. +// + +#include "Automata.h" + +const std::vector Automata::reservedWords = { + {L"struct", STRUCT}, + {L"boolean", BOOLEAN}, + {L"short", SHORT}, + {L"long", LONG}, + {L"double", DOUBLE}, + {L"float", FLOAT}, + {L"int8", INT8}, + {L"int16", INT16}, + {L"int32", INT32}, + {L"int64", INT64}, + {L"uint8", UINT8}, + {L"uint16", UINT16}, + {L"uint32", UINT32}, + {L"uint64", UINT64}, + {L"char", CHAR}, + {L"unsigned", UNSIGNED}, + {L"TRUE", T_TRUE}, + {L"FALSE", T_FALSE} +}; + +const std::map Automata::stateTypeStrMap = { + {START, L"START"}, + {S_LETTER, L"LETTER"}, + {S_UNDERLINE, L"UNDERLINE"}, + {S_DIGIT, L"DIGIT"}, + {INT_0, L"INT_0"}, + {INT_NOT_0, L"INT_NOT_0"}, + {INT_TYPE_SUFFIX, L"INT_TYPE_SUFFIX"}, + {STRING_START, L"STRING_START"}, + {STRING_END, L"STRING_END"}, + {DONE, L"DONE"}, + {S_SIGN, L"SIGN"} +}; + +const std::map Automata::tokenTypeStrMap = { + {STRUCT, L"STRUCT"}, + {BOOLEAN, L"BOOLEAN"}, + {SHORT, L"SHORT"}, + {LONG, L"LONG"}, + {FLOAT, L"FLOAT"}, + {DOUBLE, L"DOUBLE"}, + {INT8, L"INT8"}, + {INT16, L"INT16"}, + {INT32, L"INT32"}, + {INT64, L"INT64"}, + {UINT8, L"UINT8"}, + {UINT16, L"UINT16"}, + {UINT32, L"UINT32"}, + {UINT64, L"UINT64"}, + {CHAR, L"CHAR"}, + {UNSIGNED, L"UNSIGNED"}, + {OPENING_BRACE, L"OPENING_BRACE"}, + {CLOSING_BRACE, L"CLOSING_BRACE"}, + {SEMICOLON, L"SEMICOLON"}, + {LEFT_BRACKET, L"LEFT_BRACKET"}, + {RIGHT_BRACKET, L"RIGHT_BRACKET"}, + {MULT, L"MULT"}, + {PLUS, L"PLUS"}, + {SUB, L"SUB"}, + {TILDE, L"TILDE"}, + {SLASH, L"SLASH"}, + {PERCENT, L"PERCENT"}, + {LEFT_SHIFT, L"LEFT_SHIFT"}, + {RIGHT_SHIFT, L"RIGHT_SHIFT"}, + {AND, L"AND"}, + {INSERT, L"INSERT"}, + {DELIMITER, L"DELIMITER"}, + {COMMA, L"COMMA"}, + {ID, L"ID"}, + {LETTER, L"LETTER"}, + {DIGIT, L"DIGIT"}, + {UNDERLINE, L"UNDERLINE"}, + {T_TRUE, L"TRUE"}, + {T_FALSE, L"FALSE"}, + {INTEGER, L"INTEGER"}, + {INTEGER_TYPE_SUFFIX, L"INTEGER_TYPE_SUFFIX"}, + {T_BOOLEAN, L"BOOLEAN"}, + {STRING, L"STRING"}, + {COMMA, L"COMMA"}, + {NONE, L"NONE"}, + {ERROR, L"ERROR"}, + {T_EOF, L"EOF"} +}; + +void Automata::output() { + + std::wofstream stream(L"./tokenOut.txt", std::ios::binary | std::ios::trunc); + int temp_line = 1; + stream << "1 "; + + for (const auto& token : tokens) { + if (token.line > temp_line) { + temp_line = token.line; + stream << std::endl << temp_line << ' '; + } + + if (token.token == ERROR) { + stream << tokenTypeStrMap.find(token.token)->second << '{' << token.str << ", " << stateTypeStrMap.find(token.state)->second << '}' << ' '; + } + else { + stream << tokenTypeStrMap.find(token.token)->second << '(' << token.str << ')' << ' '; + } + + } + + stream.close(); +} + +wchar_t Automata::nextChar() { + wchar_t c; + input >> std::noskipws >> c; + if (input.eof()) { + this->ifeof = true; + } + return c; +} + +Automata::TokenInfo Automata::nextToken() { + TokenType currentToken = NONE; + StateType state = START, last_state = S_NONE; + std::wstringstream ss; + while (state != DONE) { + + last_state = state; + + wchar_t CH = nextChar(); + bool save = true; + if (ifeof) { + currentToken = T_EOF; + break; + } + switch (state) { + case START: + if (isdigit(CH)) { + if (CH == '0') { + state = INT_0; + } + else { + state = INT_NOT_0; + } + } + else if (isalpha(CH)) { + state = S_LETTER; + } + else if (CH == '\"') { + state = STRING_START; + } + else if ((CH == ' ') || (CH == '\t') || (CH == '\r')) { + save = false; + } + else if (CH == '\n') { + this->line++; + save = false; + } + else { + + state = S_SIGN; + switch (CH) { + case '{': + currentToken = OPENING_BRACE; + break; + case '}': + currentToken = CLOSING_BRACE; + break; + case ';': + currentToken = SEMICOLON; + break; + case '[': + currentToken = LEFT_BRACKET; + break; + case ']': + currentToken = RIGHT_BRACKET; + break; + case '*': + currentToken = MULT; + break; + case '+': + currentToken = PLUS; + break; + case '-': + currentToken = SUB; + break; + case '~': + currentToken = TILDE; + break; + case '/': + currentToken = SLASH; + break; + case '%': + currentToken = PERCENT; + break; + case '>': + CH = nextChar(); + if (CH == '>') { + currentToken = RIGHT_SHIFT; + ss << CH; + } + else { + pushBackChar(); + currentToken = ERROR; + } + break; + case '<': + CH = nextChar(); + if (CH == '<') { + currentToken = LEFT_SHIFT; + ss << CH; + } + else { + pushBackChar(); + currentToken = ERROR; + } + break; + case '&': + currentToken = AND; + break; + case '^': + currentToken = INSERT; + break; + case '|': + currentToken = DELIMITER; + break; + case ',': + currentToken = COMMA; + break; + default: + currentToken = ERROR; + break; + } + } + break; + + case S_LETTER: + if (CH == '_') { + state = S_UNDERLINE; + } + else if (isdigit(CH)) { + state = S_DIGIT; + } + else if (isalpha(CH)) { + state = S_LETTER; + } + else { + currentToken = ID; + pushBackChar(); + state = DONE; + save = false; + } + break; + + case S_DIGIT: + if (isalpha(CH)) { + state = S_LETTER; + } else if (isdigit(CH)) { + state = S_DIGIT; + } + else { + currentToken = ID; + pushBackChar(); + state = DONE; + save = false; + } + break; + + case S_UNDERLINE: + if (isdigit(CH)) { + state = S_DIGIT; + } else if (isalpha(CH)) { + state = S_LETTER; + } + else { + pushBackChar(); + currentToken = ERROR; + save = false; + } + break; + + case INT_0: + if (CH == 'l' || CH == 'L') { + state = INT_TYPE_SUFFIX; + } + else { + currentToken = INTEGER; + pushBackChar(); + state = DONE; + save = false; + } + break; + + case INT_NOT_0: + if (CH == 'l' || CH == 'L') { + state = INT_TYPE_SUFFIX; + } + else if (isdigit(CH)) { + state = INT_NOT_0; + } + else { + currentToken = INTEGER; + pushBackChar(); + state = DONE; + save = false; + } + break; + + case INT_TYPE_SUFFIX: + state = DONE; + currentToken = INTEGER; + pushBackChar(); + save = false; + break; + + case STRING_START: + if (CH == '\\') { + wchar_t buff_c = CH; + CH = nextChar(); + + if ((CH == 'b') || (CH == 't') || (CH == 'n') || (CH == 'f') || (CH == 'r') + || (CH == '"') || (CH == '\\')) { + ss << buff_c; + state = STRING_START; + } + else { + pushBackChar(); + currentToken = ERROR; + } + } + else if (CH == '\"') { + state = STRING_END; + } + else if (CH == ' ') { + ss << '\\' << '4'; + CH = '0'; + state = STRING_START; + } + else { + state = STRING_START; + } + break; + + case STRING_END: + state = DONE; + currentToken = STRING; + pushBackChar(); + save = false; + break; + + case S_SIGN: + state = DONE; + pushBackChar(); + save = false; + break; + } + + if (save) { + ss << CH; + } + + if (state == DONE) { + const std::wstring token = ss.str(); + if (currentToken == ID) { + currentToken = reservedLookup(token); + } + } + + } + + return TokenInfo(this->line, currentToken, ss.str(), last_state); +} + +Automata::TokenType Automata::reservedLookup(const std::wstring &s) { + for (const auto& word : reservedWords) + if (word.str == s) + return word.token; + return ID; +} + +void Automata::pushBackChar() { + input.seekg(-1L, std::ios::cur); +} diff --git a/src/SyntaxParser.cpp b/src/SyntaxParser.cpp index 0ec11e1..a10e370 100644 --- a/src/SyntaxParser.cpp +++ b/src/SyntaxParser.cpp @@ -110,11 +110,7 @@ void SyntaxParser::getToken() { for(int i = 1; i < tokens.size(); i++) { if(tokens[i] == L"\r") continue;; auto token_info = get_token_info(tokens[i]); - int symbol_index; - - symbol_index = pool->getSymbolIndex(token_info.first); - - tokens_queue.push(symbol_index); + tokens_queue.push(pool->getSymbolIndex(token_info.first)); _line_index++; }