/*
 Copyright Dave Bone 1998 - 2014
 All Rights Reserved.
 No part of this document may be reproduced without written consent from the author.

 FILE:     identifier.lex
 dates:    17 Juin 2003
 Purpose:  yacco2 identifier
 Returned: T_identifier
 Mod:      include keyword handling that can have embedded "-" - 23 Mar. 2005
 Note:     "-" in Rminus_la. This protects the a-b expression in
           the parallel-la-boundary from being considered bad.
           Take 2: use a bktrk strategy. If keyword then return it
           else see if the suffix of "-" needs dropping to produce
           a shorter identifier.
*/
/@
@i "/usr/local/yacco2/copyright.w"
@** |identifier| thread.\fbreak
Does three things:\fbreak
1) builds up strings starting with a letter followed by alphanumerics, underscores and hyphens\fbreak
2) determines whether it is an identifier, keyword, or xxx-in-stbl where xxx is one of rule, T.\fbreak
3) watches for the possibility of a shortened identifier within the lookahead expression.\fbreak
This is caused by the hyphen being overloaded: within the lookahead expression it is
an operator.
The default is the longest possible identifier string which could be a keyword.
Barring this it is an identifier up to but excluding the first hyphen.
This is not pure in its error checking as the string could be in error within the
lookahead expression and the faulty keyword as stated gets checked as the shortened identifier.
Well the undefined identifier should be caught when the lookahead expression
is postfixed evaluated against the symbol table.

To keep each drawing contained to a page, I broke the rules having many subrules
like |RUPPER| into smaller rules: |RUPPER_A_M| and |RUPPER_N_Z|.

An Optimization:\fbreak
Bypass the PDA of the grammar and use a FA for the tail-gating characters.
Dtrace exposes optimization opportunities well.
2) Optimize: rid use of string to build up symbol name: use a raw buffer.
Dtrace showed its inefficiencies: strlen, memcpy, and mutex locks.\fbreak
Nov.
2008.\fbreak
@/
fsm
(fsm-id "identifier.lex",fsm-filename identifier,fsm-namespace NS_identifier
,fsm-class Cidentifier{
  user-prefix-declaration
#include "yacco2_stbl.h"
  ***
  user-declaration
  public:
    // Token position of the first "-" met in the suffix; -1 while none seen.
    // Handed to override_current_token when the "-" suffix is backtracked.
    int hyphen_pos_;
    // Index inside ddd_ where the first "-" is stored; -1 while none seen.
    // The name is cut at this index when the "-" suffix is dropped.
    int hyphen_idx_;
    // Symbol of the first "-" met in the suffix; 0 while none seen.
    CAbs_lr1_sym* hyphen_;
    // Raw 0-terminated buffer holding the name being built up.
    char ddd_[1024];
    // Number of characters currently held in ddd_: the index of its terminating 0.
    int ddd_idx_;
  ***
  op
    // Same reset as the constructor: no hyphen seen and an empty name.
    // NOTE(review): presumably run each time the thread starts a new scan - confirm.
    hyphen_pos_ = -1;
    hyphen_ = 0;
    hyphen_idx_ = -1;
    ddd_idx_ = 0;
    ddd_[ddd_idx_] = 0;
  ***
  constructor
    // Start with no hyphen seen and an empty name.
    hyphen_pos_ = -1;
    hyphen_ = 0;
    hyphen_idx_ = -1;
    ddd_idx_ = 0;
    ddd_[ddd_idx_] = 0;
  ***
}
,fsm-version "1.0",fsm-date "17 Juin 2003",fsm-debug "false"
,fsm-comments "Yacco2 identifiers lexer with symbol table lookup.")
parallel-parser
(
  parallel-thread-function
    TH_identifier
  ***
  parallel-la-boundary
    eolr
  ***
)
@"/usr/local/yacco2/compiler/grammars/yacco2_T_includes.T"

rules{
Ridentifier (
  lhs {
    /@
    Check if the identifier is a keyword by use of
    the symbol table and if so create the keyword and return it.
    Now return as an identifier with the prefix not containing the hyphen.
    @/
    op
      // Classify the name held in fsm->ddd_ and return exactly one symbol by RSVP:
      //  1) the full name is in the symbol table: keyword or xxx-in-stbl
      //  2) no hyphen was seen: plain T_identifier
      //  3) hyphen seen: back up to the first "-", cut the name there and
      //     classify the shortened name the same way
      using namespace NS_yacco2_T_enum;
      using namespace NS_yacco2_terminals;
      using namespace yacco2_stbl;
      Cidentifier* fsm = (Cidentifier*) rule_info__.parser__->fsm_tbl__;
      CAbs_lr1_sym* sym(0);
      T_sym_tbl_report_card report_card;
      find_sym_in_stbl(report_card,*fsm->ddd_);
      if(report_card.action_ == T_sym_tbl_report_card::fnd) goto item_in_stbl;
      // strip out - suffix as in a-b where suffix -b is bktracked
      if(fsm->hyphen_ != 0){// re-align to - as lookahead
        rule_info__.parser__->override_current_token(*fsm->hyphen_,fsm->hyphen_pos_);
        fsm->ddd_[fsm->hyphen_idx_] = 0;
      }else{
        sym = new T_identifier((const char*)&fsm->ddd_);
        sym->set_rc(*rule_info__.parser__->start_token__,__FILE__,__LINE__);
        RSVP(sym);
        return;
      }
      find_sym_in_stbl(report_card,*fsm->ddd_);// relook up id without "-" sufx
      if(report_card.action_ == T_sym_tbl_report_card::fnd) goto item_in_stbl;
      sym = new T_identifier((const char*)&fsm->ddd_);
      sym->set_rc(*rule_info__.parser__->start_token__,__FILE__,__LINE__);
      RSVP(sym);
      return;
    item_in_stbl:
      if(report_card.tbl_entry_->type_ != table_entry::keyword){
        // return xxx-in-stbl where xxx one of rule, T
        sym = report_card.tbl_entry_->symbol_;
        sym->set_rc(*rule_info__.parser__->start_token__,__FILE__,__LINE__);
        RSVP(sym);
        return;
      }
      // keyword handling: a fresh keyword terminal is created for each occurrence
      // and stamped by set_rc with this token as its source reference
      kw_in_stbl* kw_in = (kw_in_stbl*)report_card.tbl_entry_->symbol_;
      CAbs_lr1_sym* kw = kw_in->keyword_in_stbl();
      // starts at 0 so that an enumerate missing from the switch is detected
      // instead of leaving the pointer indeterminate
      CAbs_lr1_sym* nkw = 0;
      switch(kw->enumerated_id__){
        case T_Enum::T_T_raw_characters_:{nkw = new T_raw_characters;break;}
        case T_Enum::T_T_lr1_constant_symbols_:
          {nkw = new T_lr1_constant_symbols;break;}
        case T_Enum::T_T_error_symbols_:{nkw = new T_error_symbols;break;}
        case T_Enum::T_T_eocode_:{nkw = new T_eocode;break;}
        case T_Enum::T_T_AD_:{nkw = new T_AD;break;}
        case T_Enum::T_T_AB_:{nkw = new T_AB;break;}
        case T_Enum::T_T_parallel_la_boundary_:
          {nkw = new T_parallel_la_boundary;break;}
        case T_Enum::T_T_arbitrator_code_:{nkw = new T_arbitrator_code;break;}
        case T_Enum::T_T_parallel_parser_:{nkw = new T_parallel_parser;break;}
        case T_Enum::T_T_parallel_thread_function_:
          {nkw = new T_parallel_thread_function;break;}
        case T_Enum::T_T_parallel_control_monitor_:
          {nkw = new T_parallel_control_monitor;break;}
        case T_Enum::T_T_fsm_:{nkw = new T_fsm;break;}
        case T_Enum::T_T_fsm_id_:{nkw = new T_fsm_id;break;}
        case T_Enum::T_T_fsm_filename_:{nkw = new T_fsm_filename;break;}
        case T_Enum::T_T_fsm_namespace_:{nkw = new T_fsm_namespace;break;}
        case T_Enum::T_T_fsm_class_:{nkw = new T_fsm_class;break;}
        case T_Enum::T_T_fsm_version_:{nkw = new T_fsm_version;break;}
        case T_Enum::T_T_fsm_date_:{nkw = new T_fsm_date;break;}
        case T_Enum::T_T_fsm_debug_:{nkw = new T_fsm_debug;break;}
        case T_Enum::T_T_fsm_comments_:{nkw = new T_fsm_comments;break;}
        case T_Enum::T_T_terminals_:{nkw = new T_terminals;break;}
        case T_Enum::T_T_enumeration_:{nkw = new T_enumeration;break;}
        case T_Enum::T_T_file_name_:{nkw = new T_file_name;break;}
        case T_Enum::T_T_name_space_:{nkw = new T_name_space;break;}
        case T_Enum::T_T_sym_class_:{nkw = new T_sym_class;break;}
        case T_Enum::T_T_rules_:{nkw = new T_rules;break;}
        case T_Enum::T_T_lhs_:{nkw = new T_lhs;break;}
        case T_Enum::T_T_user_declaration_:{nkw = new T_user_declaration;break;}
        case T_Enum::T_T_user_prefix_declaration_:
          {nkw = new T_user_prefix_declaration;break;}
        case T_Enum::T_T_user_suffix_declaration_:
          {nkw = new T_user_suffix_declaration;break;}
        case T_Enum::T_T_constructor_:{nkw = new T_constructor;break;}
        case T_Enum::T_T_destructor_:{nkw = new T_destructor;break;}
        case T_Enum::T_T_op_:{nkw = new T_op;break;}
        case T_Enum::T_T_failed_:{nkw = new T_failed;break;}
        case T_Enum::T_T_user_implementation_:
          {nkw = new T_user_implementation;break;}
        case T_Enum::T_T_user_imp_tbl_:{nkw = new T_user_imp_tbl;break;}
        case T_Enum::T_T_user_imp_sym_:{nkw = new T_user_imp_sym;break;}
        case T_Enum::T_T_constant_defs_:{nkw = new T_constant_defs;break;}
        case T_Enum::T_T_terminals_refs_:{nkw = new T_terminals_refs;break;}
        case T_Enum::T_T_terminals_sufx_:{nkw = new T_terminals_sufx;break;}
        case T_Enum::T_T_lrk_sufx_:{nkw = new T_lrk_sufx;break;}
        case T_Enum::T_LR1_eog_:{nkw = new LR1_eog;break;}
        case T_Enum::T_LR1_eolr_:{nkw = new LR1_eolr;break;}
        case T_Enum::T_T_NULL_:{nkw = new T_NULL;break;}
        default:{break;}// keyword enumerate not known to this lexer
      }
      if(nkw == 0){
        // NOTE(review): a keyword was found in the symbol table but it has no case above.
        // Hand the text back as a plain identifier rather than calling through an
        // unset pointer. Confirm this is the wanted fallback when adding new keywords.
        nkw = new T_identifier((const char*)&fsm->ddd_);
      }
      nkw->set_rc(*rule_info__.parser__->start_token__,__FILE__,__LINE__);
      RSVP(nkw);
      return;
    ***
  }
){
  -> Rstart_char
  -> Rstart_char Rid_suffix
}

Rid_suffix () {
  -> |+| {
    /@
    Pure finite automata for identifier suffix.\fbreak
    Capture the index of where the first hyphen is.
    As length is relative to one, this is its index before i add it to
    the being built up string.
    i could have added the character before and then used the length minus one to get its
    index: ahh the off by one count as the array operator is of course relative to zero.
    Dave u and your rants...and rolls.

    Note the 1st entry character into this rule is on the stack
    which is what the stack frame uses to set the specific symbols.
    The current token for the parser is now the lookahead character.
    This is why I reset the parsing character upon entry to the previous token.
    @/
    op
      // Hand written scan loop: starting with the character held in the stack frame,
      // append every letter, digit, underscore or "-" to fsm->ddd_ and stop at the
      // first character of any other kind, which stays as the current token.
      Cidentifier* fsm = (Cidentifier*) rule_info__.parser__->fsm_tbl__;
      CAbs_lr1_sym* sym = sf->p1__;
      parser()->reset_current_token(parser()->current_token_pos()-1);// bk trk to previous chr which is on stack
      int id;
    filter_char:
      id = sym->enumerated_id();
      if((id >= NS_yacco2_T_enum::T_Enum::T_raw_a_) && (id <= NS_yacco2_T_enum::T_Enum::T_raw_z_))
        goto bld_str;
      if((id >= NS_yacco2_T_enum::T_Enum::T_raw_A_) && (id <= NS_yacco2_T_enum::T_Enum::T_raw_Z_))
        goto bld_str;
      if((id >= NS_yacco2_T_enum::T_Enum::T_raw_0_) && (id <= NS_yacco2_T_enum::T_Enum::T_raw_9_))
        goto bld_str;
      if(id == NS_yacco2_T_enum::T_Enum::T_raw_under_score_) goto bld_str;
      if(id == NS_yacco2_T_enum::T_Enum::T_raw_minus_){
        if(fsm->hyphen_ == 0){// only the first "-" is remembered as the backtrack point
          fsm->hyphen_pos_ = rule_info__.parser__->current_token_pos__-1;
          fsm->hyphen_ = sym;
          // note: len rel 1, idx rel 0: "-" not added yet to str, this will be its idx value
          fsm->hyphen_idx_ = fsm->ddd_idx_;
        }
        goto bld_str;
      }
      return; // end-of-identifier
    bld_str:
      // Keep the last slot of ddd_ for the terminating 0. Characters beyond the buffer
      // capacity are still consumed so the token boundary is unchanged, but they are
      // not stored: the name is truncated instead of overrunning the buffer.
      if(fsm->ddd_idx_ < (int)sizeof(fsm->ddd_) - 1){
        fsm->ddd_[fsm->ddd_idx_] = sym->id__[0];
        ++fsm->ddd_idx_;
        fsm->ddd_[fsm->ddd_idx_] = 0;
      }
      parser()->get_next_token();// as current token
      sym = parser()->current_token__;
      goto filter_char;
    ***
  }
}

Rstart_char () {
  -> RUPPER_A_M
  -> RUPPER_N_Z
  -> Rlower_a_m
  -> Rlower_n_z
}

RUPPER_A_M (
  lhs {
    op
      // Append the first character of the stack token at top_sub__ - 1 to the name
      // buffer and keep the buffer 0-terminated.
      // NOTE(review): presumably that token is the letter just recognized - confirm.
      Cidentifier* fsm = (Cidentifier*) rule_info__.parser__->fsm_tbl__;
      size_t pos = rule_info__.parser__->parse_stack__.top_sub__ - 1;
      CAbs_lr1_sym* sym = rule_info__.parser__->get_spec_stack_token(pos);
      fsm->ddd_[fsm->ddd_idx_] = sym->id__[0];
      ++fsm->ddd_idx_;
      fsm->ddd_[fsm->ddd_idx_] = 0;
    ***
  }
){
  -> A
  -> B
  -> C
  -> D
  -> E
  -> F
  -> G
  -> H
  -> I
  -> J
  -> K
  -> L
  -> M
}

RUPPER_N_Z (
  lhs {
    op
      // Append the first character of the stack token at top_sub__ - 1 to the name
      // buffer and keep the buffer 0-terminated.
      // NOTE(review): presumably that token is the letter just recognized - confirm.
      Cidentifier* fsm = (Cidentifier*) rule_info__.parser__->fsm_tbl__;
      size_t pos = rule_info__.parser__->parse_stack__.top_sub__ - 1;
      CAbs_lr1_sym* sym = rule_info__.parser__->get_spec_stack_token(pos);
      fsm->ddd_[fsm->ddd_idx_] = sym->id__[0];
      ++fsm->ddd_idx_;
      fsm->ddd_[fsm->ddd_idx_] = 0;
    ***
  }
){
  -> N
  -> O
  -> P
  -> Q
  -> "R" // considered a Rule when not quoted!
  -> S
  -> T
  -> U
  -> V
  -> W
  -> X
  -> Y
  -> Z
}

Rlower_a_m (
  lhs {
    op
      // Append the first character of the stack token at top_sub__ - 1 to the name
      // buffer and keep the buffer 0-terminated.
      // NOTE(review): presumably that token is the letter just recognized - confirm.
      Cidentifier* fsm = (Cidentifier*) rule_info__.parser__->fsm_tbl__;
      size_t pos = rule_info__.parser__->parse_stack__.top_sub__ - 1;
      CAbs_lr1_sym* sym = rule_info__.parser__->get_spec_stack_token(pos);
      fsm->ddd_[fsm->ddd_idx_] = sym->id__[0];
      ++fsm->ddd_idx_;
      fsm->ddd_[fsm->ddd_idx_] = 0;
    ***
  }
){
  -> a
  -> b
  -> c
  -> d
  -> e
  -> f
  -> g
  -> h
  -> i
  -> j
  -> k
  -> l
  -> m
}

Rlower_n_z (
  lhs {
    op
      // Append the first character of the stack token at top_sub__ - 1 to the name
      // buffer and keep the buffer 0-terminated.
      // NOTE(review): presumably that token is the letter just recognized - confirm.
      Cidentifier* fsm = (Cidentifier*) rule_info__.parser__->fsm_tbl__;
      size_t pos = rule_info__.parser__->parse_stack__.top_sub__ - 1;
      CAbs_lr1_sym* sym = rule_info__.parser__->get_spec_stack_token(pos);
      fsm->ddd_[fsm->ddd_idx_] = sym->id__[0];
      ++fsm->ddd_idx_;
      fsm->ddd_[fsm->ddd_idx_] = 0;
    ***
  }
){
  -> n
  -> o
  -> p
  -> q
  -> r
  -> s
  -> t
  -> u
  -> v
  -> w
  -> x
  -> y
  -> z
}

RNUMBERS (
  lhs {
    op
      // Append the first character of the stack token at top_sub__ - 1 to the name
      // buffer and keep the buffer 0-terminated.
      // NOTE(review): no rule visible in this grammar references RNUMBERS; digits of
      // the suffix are consumed by the scan loop in Rid_suffix - confirm it is still needed.
      Cidentifier* fsm = (Cidentifier*) rule_info__.parser__->fsm_tbl__;
      size_t pos = rule_info__.parser__->parse_stack__.top_sub__ - 1;
      CAbs_lr1_sym* sym = rule_info__.parser__->get_spec_stack_token(pos);
      fsm->ddd_[fsm->ddd_idx_] = sym->id__[0];
      ++fsm->ddd_idx_;
      fsm->ddd_[fsm->ddd_idx_] = 0;
    ***
  }
){
  -> 0
  -> 1
  -> 2
  -> 3
  -> 4
  -> 5
  -> 6
  -> 7
  -> 8
  -> 9
}
}// end of rules