scanner.cc

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2008 Kamil Dudka <xdudka00@stud.fit.vutbr.cz>
00003  *
00004  * This file is part of vyp08 (compiler and interpreter of VYP08 language).
00005  *
00006  * vyp08 is free software: you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation, either version 3 of the License, or
00009  * any later version.
00010  *
00011  * vyp08 is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with vyp08.  If not, see <http://www.gnu.org/licenses/>.
00018  */
00019 
00020 #include "config.h"
00021 #include "scanner.h"
00022 
00023 #include "vypIO.h"
00024 
00025 #ifndef BUILDING_DOX
00026 #   include <boost/regex.hpp>
00027 #   include <FlexLexer.h>
00028 #   include <map>
00029 #   include <sstream>
00030 #endif
00031 
00032 using namespace StreamDecorator;
00033 using std::string;
00034 
00040 class PrivateFlexLexer: public yyFlexLexer {
00041     public:
00042         PrivateFlexLexer(std::istream &input, string fileName):
00043             yyFlexLexer(&input, &std::cerr),
00044             fileName_(fileName),
00045             hasError_(false)
00046         {
00047         }
00049         bool hasError() const {
00050             return hasError_;
00051         }
00053         EToken readNext() {
00054             return static_cast<EToken>
00055                 (this->yylex());
00056         }
00058         void postError(const char *msg) {
00059             this->LexerError(msg);
00060         }
00061     protected:
00063         virtual void LexerOutput(const char *buf, int size) {
00064             string msg(buf, size);
00065             this->LexerError(msg.c_str());
00066         }
00068         virtual void LexerError(const char *msg) {
00069             this->hasError_ = true;
00070             std::ostream &str = *(this->yyout);
00071             str << Error(E_ERROR, fileName_, msg, lineno(), "lexical error")
00072                 << std::endl;
00073         }
00074     private:
00075         string  fileName_;                              
00076         bool    hasError_;                              
00077 };
00078 
00082 class FlexScanner: public IScanner {
00083     public:
00084         FlexScanner(std::istream &input, string fileName) {
00085             flex_ = new PrivateFlexLexer(input, fileName);
00086         }
00087         virtual ~FlexScanner() {
00088             delete flex_;
00089         }
00090         virtual bool readNext(Token &token);
00091         virtual bool hasError() const {
00092             return flex_->hasError();
00093         }
00094     private:
00095         PrivateFlexLexer *flex_;                        
00096 };
00097 
00102 class KwScanner: public IScanner {
00103     public:
00104         KwScanner(IScanner *scannerToUseAndDelete):
00105             scan_(scannerToUseAndDelete)
00106         {
00107             initMap();
00108         }
00109         virtual ~KwScanner() {
00110             delete scan_;
00111         }
00112         virtual bool readNext(Token &token);
00113         virtual bool hasError() const {
00114             return scan_->hasError();
00115         }
00116     private:
00117         typedef STD_MAP(string, EToken) TMap;
00118         TMap map_;                                      
00119 
00120         IScanner *scan_;                                
00121         void initMap();
00122 };
00123 
00124 #ifndef BUILDING_DOX
00125 namespace {
00126 #endif
00127 
/**
 * Read a number from its textual representation using operator>> of
 * std::istringstream.
 * @param s Text to read the number from. Characters following the number
 * (if any) are not examined.
 * @param number Target variable for the extracted value.
 * @return true if the extraction succeeded, false otherwise.
 */
template <typename T> bool readNumber(const std::string &s, T &number) {
    std::istringstream str(s);
    str >> number;

    // The stream is convertible to bool only explicitly since C++11, so
    // the state is queried directly to stay valid in every dialect.
    return !str.fail();
}
00138 
/**
 * Decode a string literal given in its source form.
 *
 * The literal has to be enclosed in double quotes. Inside the quotes, any
 * character with value lower than 32 is rejected and the only recognized
 * escape sequences are \n (new line), \\ (back slash) and \" (double quote).
 *
 * @param in Source form of the literal, including the enclosing quotes.
 * @param out Target for the decoded text. It is modified only if the
 * function succeeds.
 * @return true on success, false if the literal is not valid.
 */
bool readString(const std::string &in, std::string &out) {
    const std::string::size_type len = in.size();
    if (len < 2 || in[0] != '"' || in[len - 1] != '"')
        // not enclosed in double quotes
        return false;

    // decode to a temporary, so that out stays intact on failure
    std::string decoded;
    decoded.reserve(len - 2);

    // walk through the characters between the enclosing quotes; indexing
    // (rather than a C string) makes an embedded NUL char visible as well
    for (std::string::size_type i = 1; i + 1 < len; ++i) {
        const char c = in[i];
        if (static_cast<unsigned char>(c) < 32)
            // char with ASCII value not greater than 31
            return false;

        if (c != '\\') {
            decoded.push_back(c);
            continue;
        }

        if (++i + 1 >= len)
            // back slash with no successor
            return false;

        switch (in[i]) {
            case 'n':   decoded.push_back('\n');    break;
            case '\\':  decoded.push_back('\\');    break;
            case '"':   decoded.push_back('"');     break;
            default:
                // invalid escape sequence
                return false;
        }
    }

    out.swap(decoded);
    return true;
}
00178 #ifndef BUILDING_DOX
00179 } // namespace
00180 #endif
00181 
00182 std::ostream& operator<<(std::ostream &str, EToken type) {
00183     str << Color(C_YELLOW);
00184     switch (type) {
00185         case ETOKEN_NULL:           str << "T_NULL";        break;
00186         case ETOKEN_ID:             str << "T_ID";          break;
00187         case ETOKEN_NUMBER_INT:     str << "T_INT";         break;
00188         case ETOKEN_NUMBER_DOUBLE:  str << "T_DOUBLE";      break;
00189         case ETOKEN_STRING:         str << "T_STRING";      break;
00190         case ETOKEN_OP_LCBR:
00191         case ETOKEN_OP_RCBR:
00192         case ETOKEN_OP_LPAR:
00193         case ETOKEN_OP_RPAR:
00194         case ETOKEN_OP_STAR:
00195         case ETOKEN_OP_SLASH:
00196         case ETOKEN_OP_PLUS:
00197         case ETOKEN_OP_MINUS:
00198         case ETOKEN_OP_LESS:
00199         case ETOKEN_OP_LESS_EQ:
00200         case ETOKEN_OP_GREATER:
00201         case ETOKEN_OP_GREATER_EQ:
00202         case ETOKEN_OP_ASSIGN:
00203         case ETOKEN_OP_COMMA:
00204         case ETOKEN_OP_SEMICOLON:
00205         case ETOKEN_KW_AND:
00206         case ETOKEN_KW_DIV:
00207         case ETOKEN_KW_EQ:
00208         case ETOKEN_KW_OR:
00209         case ETOKEN_KW_NEQ:
00210         case ETOKEN_KW_NOT:         str << "T_OP";          break;
00211         case ETOKEN_KW_DOUBLE:
00212         case ETOKEN_KW_ELSE:
00213         case ETOKEN_KW_IF:
00214         case ETOKEN_KW_INT:
00215         case ETOKEN_KW_STRING:
00216         case ETOKEN_KW_VAR:
00217         case ETOKEN_KW_VOID:
00218         case ETOKEN_KW_WHILE:       str << "T_KEYWORD";     break;
00219     }
00220     str << Color(C_NO_COLOR);
00221     switch (type) {
00222         case ETOKEN_NULL:
00223         case ETOKEN_ID:
00224         case ETOKEN_NUMBER_INT:
00225         case ETOKEN_NUMBER_DOUBLE:
00226         case ETOKEN_STRING:
00227             break;
00228         default:
00229             str << "[" << Color(C_LIGHT_BLUE);
00230             switch (type) {
00231                 case ETOKEN_NULL:
00232                 case ETOKEN_ID:
00233                 case ETOKEN_NUMBER_INT:
00234                 case ETOKEN_NUMBER_DOUBLE:
00235                 case ETOKEN_STRING:
00236                     break;
00237                 case ETOKEN_OP_LCBR:            str << "{";         break;
00238                 case ETOKEN_OP_RCBR:            str << "}";         break;
00239                 case ETOKEN_OP_LPAR:            str << "(";         break;
00240                 case ETOKEN_OP_RPAR:            str << ")";         break;
00241                 case ETOKEN_OP_STAR:            str << "*";         break;
00242                 case ETOKEN_OP_SLASH:           str << "/";         break;
00243                 case ETOKEN_OP_PLUS:            str << "+";         break;
00244                 case ETOKEN_OP_MINUS:           str << "-";         break;
00245                 case ETOKEN_OP_LESS:            str << "<";         break;
00246                 case ETOKEN_OP_LESS_EQ:         str << "<=";        break;
00247                 case ETOKEN_OP_GREATER:         str << ">";         break;
00248                 case ETOKEN_OP_GREATER_EQ:      str << ">=";        break;
00249                 case ETOKEN_OP_ASSIGN:          str << ":=";        break;
00250                 case ETOKEN_OP_COMMA:           str << ",";         break;
00251                 case ETOKEN_OP_SEMICOLON:       str << ";";         break;
00252                 case ETOKEN_KW_AND:             str << "and";       break;
00253                 case ETOKEN_KW_DIV:             str << "div";       break;
00254                 case ETOKEN_KW_EQ:              str << "eq";        break;
00255                 case ETOKEN_KW_OR:              str << "or";        break;
00256                 case ETOKEN_KW_NEQ:             str << "neq";       break;
00257                 case ETOKEN_KW_NOT:             str << "not";       break;
00258                 case ETOKEN_KW_DOUBLE:          str << "double";    break;
00259                 case ETOKEN_KW_ELSE:            str << "else";      break;
00260                 case ETOKEN_KW_IF:              str << "if";        break;
00261                 case ETOKEN_KW_INT:             str << "int";       break;
00262                 case ETOKEN_KW_STRING:          str << "string";    break;
00263                 case ETOKEN_KW_VAR:             str << "var";       break;
00264                 case ETOKEN_KW_VOID:            str << "void";      break;
00265                 case ETOKEN_KW_WHILE:           str << "while";     break;
00266             }
00267             str << Color(C_NO_COLOR) << "]";
00268     }
00269     return str;
00270 }
00271 
00272 std::ostream& operator<<(std::ostream &str, const Token &token) {
00273     // start with lineno:token_typ:
00274     str << Color(C_LIGHT_GREEN) << token.lineno << Color(C_NO_COLOR)
00275         << ":" << token.type;
00276 
00277     // append (...) in some cases
00278     switch (token.type) {
00279         case ETOKEN_ID:
00280         case ETOKEN_STRING:
00281             str << "[" << Color(C_LIGHT_RED) << token.text << Color(C_NO_COLOR) << "]";
00282             break;
00283         case ETOKEN_NUMBER_INT:
00284             str << "[" << Color(C_LIGHT_RED) << token.numberInt << Color(C_NO_COLOR) << "]";
00285             break;
00286         case ETOKEN_NUMBER_DOUBLE:
00287             str << "[" << Color(C_LIGHT_RED) << token.numberDouble << Color(C_NO_COLOR) << "]";
00288             break;
00289         default:
00290             break;
00291     }
00292     str << Color(C_NO_COLOR);
00293     return str;
00294 }
00295 
00296 IScanner* ScannerFactory::createScanner(std::istream &input, std::string fileName) {
00297     // construct decorator chain
00298     // ATTENTION: KwScanner object is responsible to destroy FlexScanner object
00299     return new KwScanner(
00300         new FlexScanner(input, fileName));
00301 }
00302 
00303 bool FlexScanner::readNext(Token &token) {
00304     EToken type;
00305     while ((type = flex_->readNext())) {
00306         // common part for all tokens
00307         token.type      = type;
00308         token.lineno    = flex_->lineno();
00309 
00310         // token-specific actions
00311         switch (type) {
00312             case ETOKEN_ID:
00313                 token.text = flex_->YYText();
00314                 return true;
00315             case ETOKEN_NUMBER_INT:
00316                 if (readNumber(flex_->YYText(), token.numberInt))
00317                     return true;
00318                 else
00319                     flex_->postError("invalid integral literal");
00320                 break;
00321             case ETOKEN_NUMBER_DOUBLE:
00322                 if (readNumber(flex_->YYText(), token.numberDouble))
00323                     return true;
00324                 else
00325                     flex_->postError("invalid decimal literal");
00326                 break;
00327             case ETOKEN_STRING:
00328                 if (readString(flex_->YYText(), token.text))
00329                     return true;
00330                 else
00331                     flex_->postError("invalid string literal");
00332                 break;
00333             default:
00334                 return true;
00335         } // switch (type)
00336     } // while ((type = flex_->readNext()))
00337     return false;
00338 }
00339 
00340 bool KwScanner::readNext(Token &token) {
00341     if (!scan_->readNext(token))
00342         // no token from input
00343         return false;
00344 
00345     if (token.type != ETOKEN_ID)
00346         // can't be keyword
00347         return true;
00348 
00349     TMap::const_iterator i = map_.find(token.text);
00350     if (i != map_.end())
00351         // keyword match
00352         token.type = i->second;
00353 
00354     return true;
00355 }
00356 
00358 void KwScanner::initMap() {
00359     // initialize keyword->token mapping
00360     map_["and"]     = ETOKEN_KW_AND;
00361     map_["div"]     = ETOKEN_KW_DIV;
00362     map_["double"]  = ETOKEN_KW_DOUBLE;
00363     map_["else"]    = ETOKEN_KW_ELSE;
00364     map_["eq"]      = ETOKEN_KW_EQ;
00365     map_["if"]      = ETOKEN_KW_IF;
00366     map_["int"]     = ETOKEN_KW_INT;
00367     map_["neq"]     = ETOKEN_KW_NEQ;
00368     map_["not"]     = ETOKEN_KW_NOT;
00369     map_["or"]      = ETOKEN_KW_OR;
00370     map_["string"]  = ETOKEN_KW_STRING;
00371     map_["var"]     = ETOKEN_KW_VAR;
00372     map_["void"]    = ETOKEN_KW_VOID;
00373     map_["while"]   = ETOKEN_KW_WHILE;
00374 }

Generated on Sat Jul 4 18:32:59 2009 for vyp08 (compiler and interpreter of VYP08 language) by  doxygen 1.5.4