From 70ac1bb025ae2653fc20dce7c832a0ca88d8194d Mon Sep 17 00:00:00 2001 From: satunreric Date: Thu, 29 Apr 2021 18:55:04 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E6=B7=BB=E5=8A=A0=E7=9B=B8?= =?UTF-8?q?=E5=85=B3=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 2 +- cmake-build-debug/syntaxInput.txt | 11 +- main.cpp | 853 +++++++++++++++++++++++------- 3 files changed, 664 insertions(+), 202 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed593fd..cbf011b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.17) project(syntaxParser) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 20) add_executable(syntaxParser main.cpp) \ No newline at end of file diff --git a/cmake-build-debug/syntaxInput.txt b/cmake-build-debug/syntaxInput.txt index 3e37a3e..84aafd0 100644 --- a/cmake-build-debug/syntaxInput.txt +++ b/cmake-build-debug/syntaxInput.txt @@ -1,8 +1,3 @@ -@E -> T E' -E' -> "+" T E' -E' -> ε -T -> F T' -T' -> "*" F T' -T' -> ε -F -> "(" @E ")" -F -> "id" +@S -> C C +C -> "c" C +C -> "d" \ No newline at end of file diff --git a/main.cpp b/main.cpp index 17b19aa..861781f 100644 --- a/main.cpp +++ b/main.cpp @@ -8,6 +8,7 @@ #include #include #include +#include using namespace std; @@ -19,13 +20,14 @@ using std::wcout; using std::endl; using std::to_string; using std::hash; +using std::setw; struct Symbol { const int index; - const wstring name; - const bool terminator; - const bool start; + wstring name; + bool terminator; + bool start; Symbol(int index, wstring name, bool terminator, bool start): index(index), @@ -60,7 +62,7 @@ public: line.push_back(symbol); } - const vector &getAllSymbols() { + const vector &getAllSymbols() const { return line; } @@ -89,7 +91,7 @@ public: return symbol->index; } - const Symbol *getSymbol(int symbol_index) { + const Symbol *getSymbol(int symbol_index) const { const auto &it = cache.find(symbol_index); if(it != cache.end()) { return it->second; @@ -98,7 +100,7 @@ public: } } - int getSymbolIndex(const wstring &name) { + int getSymbolIndex(const wstring &name) const { const auto &it = table.find(name); if(it != table.end()) { return it->second->index; @@ -107,6 +109,28 @@ public: } } + void modifySymbol(int idx, const wstring &name, bool terminator, bool start) { + auto it = cache.find(idx); + if(it != cache.end()) { + auto p_sym = it->second; + p_sym->name = name; + p_sym->terminator = terminator; + p_sym->start = start; + } + } + + const Symbol *getStartSymbol() const { + for(const auto & symbol : getAllSymbols()) { + if(symbol->start) { + return symbol; + } + } + + throw runtime_error("start symbol NOT Found"); + } + + + }; // 产生式 @@ -118,137 +142,21 @@ struct Production { }; -// 项 -class Item{ - // 对应的产生式 - const Production* const production; - - // 点的位置 - int dot_index = 0; - - const int terminator = 0; - - -public: - explicit Item(Production *p_pdt, int m_terminator) : production(p_pdt), terminator(m_terminator) {} - - void set_dot_index(int m_dot_index) { - if(m_dot_index > production->right.size()) { - throw runtime_error("DOT_INDEX out of range"); - } - this->dot_index = m_dot_index; - } - - int get_dot_index() const { - return dot_index; - } - - int get_dot_next_symbol() { - if(get_dot_index() == production->right.size()) { - return 0; - } else { - return production->right[dot_index]; - } - } -}; - -class ItemCollectionManager; - -class ItemCollection{ - - int index = 0; - - vector items; - - Production *G; - - friend ItemCollectionManager; - -public: - - void addItem(Production *p_pdt, int dot_index, int terminator) { - auto *p_item = new Item(p_pdt, terminator); - p_item->set_dot_index(dot_index); - items.push_back(p_item); - } - - void CLOSURE() { - bool ifAdd = true; - - while(ifAdd) { - ifAdd = false; - - for(const auto & item : items) { - if(item->get_dot_next_symbol() == 0) { - continue; - } - // TODO - - } - - } - } - -}; - -class ItemCollectionManager{ - - int index = 1; - - map ic_map; - - vector ics; - - template - inline void hash_combine(std::size_t& seed, const T& v) - { - std::hash hasher; - seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); - } - -public: - - const vector &getItemCollections() { - return ics; - } - - bool addItemCollection(int idx, int terminator, ItemCollection *p_ic){ - auto hasher = hash(); - size_t seed = hasher(idx); - hash_combine(seed, terminator); - - auto it = ic_map.find(seed); - if(it != ic_map.end()) { - return false; - } else { - p_ic->index = this->index++; - ic_map.insert(pair(seed, p_ic)); - ics.push_back(p_ic); - return true; - } - } - -}; - -class Generator{ - - // 文件输入 - wifstream input; +// 语法资源池 +class GrammarResourcePool { // 符号表 SymbolTable symbolTable; // 产生式 - vector productions; + vector productions; // FIRST结果存储表 - map *> firsts; + map *> firsts; // FOLLOW结果存储表 map *> follows; - map C; - // 去掉首尾空格 static wstring& trim(wstring &&str) { if (str.empty()) { @@ -260,7 +168,9 @@ class Generator{ return str; } - set *FIRST(const vector &symbols, int start_index) { +public: + + const set *FIRST(const vector &symbols, int start_index) { // 生成集合 auto *non_terminator_symbols = new set(); @@ -282,7 +192,7 @@ class Generator{ return non_terminator_symbols; } - set* FIRST(int symbol) { + const set* FIRST(int symbol) { // 查找缓存 const auto it = firsts.find(symbol); @@ -302,7 +212,7 @@ class Generator{ // 遍历每一产生式 for (const auto &production : productions) { - Production *p_pdt = production; + const Production *p_pdt = production; if (p_pdt->left != symbol) continue; @@ -328,7 +238,7 @@ class Generator{ if (!production_found) non_terminator_symbols->insert(0); } - this->firsts.insert(pair *>(symbol, non_terminator_symbols)); + this->firsts.insert(pair *>(symbol, non_terminator_symbols)); return non_terminator_symbols; } @@ -471,84 +381,641 @@ class Generator{ wcout << L"}" << endl; } + void parse_production_string_line(const wstring &temp_line) { + auto middle_index = temp_line.find(L"->", 0); + + + if(middle_index == string::npos) { + throw runtime_error("-> NOT FOUND"); + } + + wstring front = trim(temp_line.substr(0, middle_index)); + int left = symbolTable.addSymbol(front, false); + + wstring back = trim(temp_line.substr(middle_index + 2, temp_line.size() - middle_index - 2)); + + wstringstream terminator, non_terminator; + vector symbols; + bool is_terminator = false; + for(const auto &c : back) { + if (c == L'\"') { + if(is_terminator) { + symbols.push_back(symbolTable.addSymbol(trim(terminator.str()), true)); + terminator.str(L""); + terminator.clear(); + } + is_terminator = !is_terminator; + continue; + } + if(c == L' ' || c == L'\r') { + wstring temp_symbol = trim(non_terminator.str()); + if(!temp_symbol.empty()) { + symbols.push_back(symbolTable.addSymbol(trim(non_terminator.str()), false)); + non_terminator.str(L""); + non_terminator.clear(); + } + continue; + } + if(is_terminator) { + terminator << c; + } else { + non_terminator << c; + } + } + + auto p_pdt = new Production(left, symbols); + + productions.push_back(p_pdt); + } + + const vector &get_productions() const { + return productions; + } + + const Symbol *getSymbol(int symbol_index) const { + return symbolTable.getSymbol(symbol_index); + } + + const Symbol *getStartSymbol() const { + return symbolTable.getStartSymbol(); + } + + int addSymbol(const wstring &name, bool terminator) { + return symbolTable.addSymbol(name, terminator); + } + + const Production *addProduction(int left, initializer_list right) { + vector right_vector; + for(int symbol : right) { + right_vector.push_back(symbol); + } + auto p_pdt = new Production(left, right_vector); + productions.push_back(p_pdt); + return p_pdt; + } + + const vector &getAllSymbols() const { + return symbolTable.getAllSymbols(); + } + + void modifySymbol(int index, const wstring &name, bool terminator, bool start) { + symbolTable.modifySymbol(index, name, terminator, start); + } +}; + +// 项 +class Item{ + // 对应的产生式 + const Production* const production; + + // 点的位置 + int dot_index = 0; + + const int terminator = 0; + public: - Generator(): input("syntaxInput.txt", std::ios::binary) { + const bool generated = false; + + explicit Item(const Production *p_pdt, int m_terminator, bool m_generated = false) + : production(p_pdt), terminator(m_terminator), generated(m_generated) {} + + void set_dot_index(int m_dot_index) { + if(m_dot_index > production->right.size()) { + throw runtime_error("DOT_INDEX out of range"); + } + this->dot_index = m_dot_index; + } + + int get_dot_index() const { + return dot_index; + } + + int get_dot_next_symbol() const { + if(get_dot_index() == production->right.size()) { + return 0; + } else { + return production->right[dot_index]; + } + } + + int get_dot_next_i_symbol(int i) const { + if(get_dot_index() + i >= production->right.size()) { + return 0; + } else { + return production->right[dot_index + i]; + } + } + + int get_terminator() const { + return terminator; + } + + const Production *get_production() const { + return production; + } +}; + +class ItemCollectionManager; + +class ItemCollection{ + + int index = 0; + + map items; + + vector cache; + + GrammarResourcePool *pool; + + friend ItemCollectionManager; + + template + inline void hash_combine(std::size_t& seed, const T& v) const + { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); + } + +public: + + explicit ItemCollection(GrammarResourcePool *pool) : pool(pool) { + + } + + const vector &getItems() const { + return cache; + } + + int getIndex() const { + return index; + } + + + bool addItem(const Production *p_pdt, int dot_index, int terminator, bool generated = false) { + auto hasher = hash(); + size_t seed = hasher(reinterpret_cast(p_pdt)); + hash_combine(seed, dot_index); + hash_combine(seed, terminator); + + auto it = items.find(seed); + if(it != items.end()) { + return false; + } + + auto *p_item = new Item(p_pdt, terminator, generated); + p_item->set_dot_index(dot_index); + items.insert(pair(seed, p_item)); + cache.push_back(p_item); + + return true; + } + + void CLOSURE() { + + bool ifAdd = true; + + while(ifAdd) { + ifAdd = false; + + for(const auto & item : items) { + int next_symbol = item.second->get_dot_next_symbol(); + + if(next_symbol == 0 + || pool->getSymbol(next_symbol)->terminator) { + continue; + } + + for(auto *production : pool->get_productions()) { + if(production->left == next_symbol) { + vector first_args; + first_args.push_back(item.second->get_dot_next_i_symbol(1)); + first_args.push_back(item.second->get_terminator()); + + const auto first_set = pool->FIRST(first_args, 0); + for(auto terminator : *first_set) { + if(terminator == 0) continue; + if(this->addItem(production, 0, terminator, true)) { + ifAdd = true; + } + } + } + } + + } + + } + } + + void print() const { + + wcout << L"I" << index << L": "; + + for(const auto item : cache) { + const auto *p_pdt = item->get_production(); + int dot_index = item->get_dot_index(); + wcout << pool->getSymbol(p_pdt->left)->name << L" -> " ; + int i = 0; + for(const auto &symbol : p_pdt->right) { + if(i++ == dot_index) cout << "·"; + wcout << pool->getSymbol(symbol)->name; + } + + if(i++ == dot_index) cout << "·"; + + wcout << L"," << pool->getSymbol(item->get_terminator())->name << endl; + } + cout << endl; + } + + size_t getHash() const { + size_t seed = 0; + for(const auto item : cache) { + + if(item->generated) { + continue; + } + + hash_combine(seed, item->get_production()); + hash_combine(seed, item->get_dot_index()); + hash_combine(seed, item->get_terminator()); + } + return seed; + } +}; + +class ItemCollectionManager{ + + int index = 0; + + map ic_map; + + map ic_content_map; + + vector ics; + + GrammarResourcePool *pool; + + const Production *start_pdt; + + template + inline void hash_combine(std::size_t& seed, const T& v) const + { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); + } + +public: + + explicit ItemCollectionManager(GrammarResourcePool *resource_pool) : pool(resource_pool) { + + } + + void buildItems() { + + const auto startSymbol = pool->getStartSymbol(); + + wstring new_symbol_name = startSymbol->name + L"'"; + + int new_symbol_index = pool->addSymbol(new_symbol_name, startSymbol->terminator); + + pool->modifySymbol(startSymbol->index, startSymbol->name.substr(1), false, false); + + const auto *p_pdt = pool->addProduction(new_symbol_index, {startSymbol->index}); + + this->start_pdt = p_pdt; + + auto *pi_ic = new ItemCollection(pool); + + // -1 代表 $ + pi_ic->addItem(p_pdt, 0, -1); + + pi_ic->CLOSURE(); + + addItemCollection(0, 0, pi_ic); + + bool ifAdd = true; + + while(ifAdd) { + + ifAdd = false; + const auto &r_ics = getItemCollections(); + vector temp_ics(r_ics.begin(), r_ics.end()); + for(const auto ic : temp_ics) { + for(const auto symbol : pool->getAllSymbols()) { + if(symbol->index <= 0) { + continue; + } + if(GOTO(ic, symbol->index)) { + ifAdd = true; + } + } + } + + } + + } + + const Production *getStartProduction() const { + return start_pdt; + } + + const vector &getItemCollections() const{ + return ics; + } + + ItemCollection *getItemCollectionByHash(size_t hash) { + ItemCollection *p_ic = nullptr; + auto it = ic_content_map.find(hash); + if(it != ic_content_map.end()) { + p_ic = it->second; + } + return p_ic; + } + + bool addItemCollection(int idx, int symbol, ItemCollection *p_ic){ + + size_t ic_hash = p_ic->getHash(); + auto it = ic_content_map.find(ic_hash); + if (it != ic_content_map.end()) { + p_ic = it->second; + } else { + p_ic->index = this->index++; + ic_content_map.insert(pair(ic_hash, p_ic)); + ics.push_back(p_ic); + } + + auto hasher = hash(); + size_t seed = hasher(idx); + hash_combine(seed, symbol); + + auto it2 = ic_map.find(seed); + if(it2 != ic_map.end()) { + return false; + } + + if(symbol != 0) + wcout << L"GOTO(" << idx << L", " << pool->getSymbol(symbol)->name << L")" << endl; + + ic_map.insert(pair(seed, p_ic)); + p_ic->print(); + return true; + + } + + const ItemCollection* getGOTO(int idx, int symbol) const { + + auto hasher = hash(); + size_t seed = hasher(idx); + hash_combine(seed, symbol); + + auto it = ic_map.find(seed); + if(it != ic_map.end()) { + return it->second; + } else { + return nullptr; + } + } + + bool GOTO(const ItemCollection *p_ic, int symbol) { + auto *pt_ic = new ItemCollection(pool); + + for(const auto &item : p_ic->cache) { + if(item->get_dot_next_symbol() == symbol) { + pt_ic->addItem(item->get_production(), item->get_dot_index() + 1, item->get_terminator()); + } + } + auto p_temp_ic = this->getItemCollectionByHash(pt_ic->getHash()); + if(p_temp_ic == nullptr) + pt_ic->CLOSURE(); + else pt_ic = p_temp_ic; + + if(!pt_ic->items.empty()) { + return this->addItemCollection(p_ic->index, symbol, pt_ic); + } else { + return false; + } + + } + +}; + +class AnalyseTableGenerator { + + using Action = enum { + MOVE, STATUTE, ACC, STEP_GOTO + }; + + struct Step { + + const Action action; + const union Target{ + int index; + const Production *production; + } target; + + Step(Action action, int index) : action(action), target(Target{index}){} + Step(Action action, const Production *p_pdt) : action(action), target(Target{.production = p_pdt}){} + }; + + map ACTION; + + map GOTO; + + const ItemCollectionManager *icm; + + const GrammarResourcePool *pool; + + template + inline void hash_combine(std::size_t& seed, const T& v) const + { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); + } + + void add_action(int index, int terminator_symbol, Action action, const Production *target_pdt) { + size_t seed = 0; + hash_combine(seed, index); + hash_combine(seed, terminator_symbol); + + auto it = ACTION.find(seed); + if(it == ACTION.end()) { + auto step = new Step(action, target_pdt); + ACTION.insert(pair(seed, step)); + } else { + if(it->second->action != action || it->second->target.production != target_pdt) + throw runtime_error("Conflict Occurred, Syntax NOT LR(1)"); + } +} + +void add_action(int index, int terminator_symbol, Action action, int target_index) { + size_t seed = 0; + hash_combine(seed, index); + hash_combine(seed, terminator_symbol); + + auto it = ACTION.find(seed); + if(it == ACTION.end()) { + auto step = new Step(action, target_index); + ACTION.insert(pair(seed, step)); + } else { + if(it->second->action != action || it->second->target.index != target_index) + throw runtime_error("Conflict Occurred, Syntax NOT LR(1)"); + } + } + + void add_goto(int index, int non_terminator_symbol, int target_index) { + size_t seed = 0; + hash_combine(seed, index); + hash_combine(seed, non_terminator_symbol); + + auto it = GOTO.find(seed); + if(it == GOTO.end()) { + auto step = new Step(STEP_GOTO, target_index); + GOTO.insert(pair(seed, step)); + } else { + if(it->second->target.index != target_index) + throw runtime_error("Conflict Occurred, Syntax NOT LR(1)"); + } + } + +public: + + explicit AnalyseTableGenerator(const GrammarResourcePool *p_pool, const ItemCollectionManager *p_icm) + :pool(p_pool) , icm(p_icm) { + + } + + void generate() { + const auto &ics = icm->getItemCollections(); + for(const auto *ic : ics) { + for(const auto *item : ic->getItems()) { + if(item->get_production() == icm->getStartProduction() + && item->get_dot_next_symbol() == 0 + && item->get_terminator() == -1) { + this->add_action(ic->getIndex(), -1, ACC, 0); + } + int next_symbol = item->get_dot_next_symbol(); + if(next_symbol != 0) { + const auto *p_ic = icm->getGOTO(ic->getIndex(), next_symbol); + if(pool->getSymbol(next_symbol)->terminator) { + if (p_ic != nullptr) { + this->add_action(ic->getIndex(), next_symbol, MOVE, p_ic->getIndex()); + } + } else { + if (p_ic != nullptr) { + this->add_goto(ic->getIndex(), next_symbol, p_ic->getIndex()); + } + } + } else { + if(item->get_production()->left != pool->getStartSymbol()->index) { + this->add_action(ic->getIndex(), next_symbol, STATUTE, item->get_production()); + } + } + } + } + } + + const Step *findActionStep(int index, int terminator_symbol) { + size_t seed = 0; + hash_combine(seed, index); + hash_combine(seed, terminator_symbol); + + auto it = ACTION.find(seed); + if(it != ACTION.end()) { + return it->second; + } else { + return nullptr; + } + } + + const Step *findGotoStep(int index, int non_terminator_symbol) { + size_t seed = 0; + hash_combine(seed, index); + hash_combine(seed, non_terminator_symbol); + + auto it = GOTO.find(seed); + if (it != GOTO.end()) { + return it->second; + } else { + return nullptr; + } + } + + void print() { + wcout << L"ACTION" << endl; + vector symbols; + + wcout << std::left << std::setw(4) << " "; + for(const auto *symbol : pool->getAllSymbols()) { + if(symbol->index == 0) continue; + if(symbol->terminator) { + wcout << std::left << std::setw(4) << symbol->name; + symbols.push_back(symbol->index); + } + } + wcout << endl; + + for(int i = 0; i < icm->getItemCollections().size(); i++){ + wcout << std::left << std::setw(4) << i; + for(int symbol : symbols) { + auto p_step = this->findActionStep(i, symbol); + if(p_step == nullptr) { + wcout << std::left << std::setw(4) << " "; + } else { + if(p_step->action == MOVE) + wcout << std::left << std::setw(4) << wstring(L"s") + to_wstring(p_step->target.index); + else if(p_step->action == ACC) + wcout << std::left << std::setw(4) << L"acc"; + else + wcout << std::left << std::setw(4) << L"r"; + } + } + wcout << endl; + } + } + +}; + +class Generator{ + + // 文件输入 + wifstream input; + + GrammarResourcePool *pool; + + map C; + + ItemCollectionManager *icm; + + AnalyseTableGenerator *atg; + +public: + + Generator(): input("syntaxInput.txt", std::ios::binary), + pool(new GrammarResourcePool()), + icm(new ItemCollectionManager(pool)), + atg(new AnalyseTableGenerator(pool, icm)){ auto* codeCvtToUTF8= new std::codecvt_utf8; input.imbue(std::locale(input.getloc(), codeCvtToUTF8)); } - void test() { - auto p_non_term_set = FIRST(1); - print_symbols(*p_non_term_set); - - p_non_term_set = FIRST(3); - print_symbols(*p_non_term_set); - - p_non_term_set = FIRST(6); - print_symbols(*p_non_term_set); - - FOLLOW(); - - print_symbols(*FOLLOW(2)); - print_symbols(*FOLLOW(6)); - print_symbols(*FOLLOW(3)); + void run() { + pool->FOLLOW(); + icm->buildItems(); + atg->generate(); + atg->print(); } // 得到所有的产生式 void getProductions() { - //读入文法文件 + // 读入文法文件 wstring temp_line; while (getline(input, temp_line)) { - auto middle_index = temp_line.find(L"->", 0); - - - if(middle_index == string::npos) { - throw runtime_error("-> NOT FOUND"); - } - - wstring front = trim(temp_line.substr(0, middle_index)); - int left = symbolTable.addSymbol(front, false); - - wstring back = trim(temp_line.substr(middle_index + 2, temp_line.size() - middle_index - 2)); - - wstringstream terminator, non_terminator; - vector symbols; - bool is_terminator = false; - for(const auto &c : back) { - if (c == L'\"') { - if(is_terminator) { - symbols.push_back(symbolTable.addSymbol(trim(terminator.str()), true)); - terminator.str(L""); - terminator.clear(); - } - is_terminator = !is_terminator; - continue; - } - if(c == L' ' || c == L'\r') { - wstring temp_symbol = trim(non_terminator.str()); - if(!temp_symbol.empty()) { - symbols.push_back(symbolTable.addSymbol(trim(non_terminator.str()), false)); - non_terminator.str(L""); - non_terminator.clear(); - } - continue; - } - if(is_terminator) { - terminator << c; - } else { - non_terminator << c; - } - } - - auto p_pdt = new Production(left, symbols); - - productions.push_back(p_pdt); - + pool->parse_production_string_line(temp_line); } } @@ -566,7 +1033,7 @@ int main() { generator.getProductions(); - generator.test(); + generator.run(); //输出时间 end = clock(); //结束时间 cout << endl;