// IdentifierUtils.cpp
  1. // Copyright (C) 2011, 2012 Google Inc.
  2. //
  3. // This file is part of YouCompleteMe.
  4. //
  5. // YouCompleteMe is free software: you can redistribute it and/or modify
  6. // it under the terms of the GNU General Public License as published by
  7. // the Free Software Foundation, either version 3 of the License, or
  8. // (at your option) any later version.
  9. //
  10. // YouCompleteMe is distributed in the hope that it will be useful,
  11. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. // GNU General Public License for more details.
  14. //
  15. // You should have received a copy of the GNU General Public License
  16. // along with YouCompleteMe. If not, see <http://www.gnu.org/licenses/>.
  17. #include "IdentifierUtils.h"
  18. #include "Utils.h"
  19. #include "standard.h"
  20. #include <boost/unordered_map.hpp>
  21. #include <boost/assign/list_of.hpp>
  22. #include <boost/regex.hpp>
  23. #include <boost/algorithm/string/regex.hpp>
  24. namespace YouCompleteMe {
  25. namespace fs = boost::filesystem;
  26. namespace {
// Pattern for the parts of a buffer that are erased before identifiers are
// collected: line comments, C-style block comments and quoted string literals.
// RemoveIdentifierFreeText() removes every match of this pattern.
const char *const COMMENT_AND_STRING_REGEX =
  "//.*?$" // Anything following '//'
  "|"
  "#.*?$"  // Anything following '#'
  "|"
  "/\\*.*?\\*/"  // C-style comments, '/* ... */'
  "|"
  // Anything inside single quotes, '...', but mind:
  //  1. that the starting single quote is not escaped
  //  2. the escaped backslash inside the string
  //  3. the escaped single quote inside the string
  "(?<!\\\\)'(?:\\\\\\\\|\\\\'|.)*?'"
  "|"
  // Anything inside double quotes, "...", but mind:
  //  1. that the starting double quote is not escaped
  //  2. the escaped backslash inside the string
  //  3. the escaped double quote inside the string
  "(?<!\\\\)\"(?:\\\\\\\\|\\\\\"|.)*?\"";
// An identifier is a letter or an underscore followed by any number of word
// characters. ExtractIdentifiersFromText() collects every match of this
// pattern.
const char *const IDENTIFIER_REGEX = "[_a-zA-Z]\\w*";
// Pattern for a single line of a tags file. For details on the supported tag
// format, see:
//   http://ctags.sourceforge.net/FORMAT
// TL;DR: The only supported format is the one Exuberant Ctags emits.
//
// Capture groups: 1 = identifier, 2 = path to the file, 3 = language.
const char *const TAG_REGEX =
  "^([^\\t\\n\\r]+)"  // The first field is the identifier
  "\\t"  // A TAB char is the field separator
  // The second field is the path to the file that has the identifier; either
  // absolute or relative to the tags file.
  "([^\\t\\n\\r]+)"
  "\\t.*?"  // Non-greedy everything
  "language:([^\\t\\n\\r]+)"  // We want to capture the language of the file
  ".*?$";
  59. // Only used as the equality comparer for the below unordered_map which stores
  60. // const char* pointers and not std::string but needs to hash based on string
  61. // values and not pointer values.
  62. // When passed a const char* this will create a temporary std::string for
  63. // comparison, but it's fast enough for our use case.
  64. struct StringEqualityComparer :
  65. std::binary_function< std::string, std::string, bool > {
  66. bool operator()( const std::string &a, const std::string &b ) const {
  67. return a == b;
  68. }
  69. };
  70. // List of languages Exuberant Ctags supports:
  71. // ctags --list-languages
  72. // To map a language name to a filetype, see this file:
  73. // :e $VIMRUNTIME/filetype.vim
  74. // This is a map of const char* and not std::string to prevent issues with
  75. // static initialization.
  76. const boost::unordered_map < const char *,
  77. const char *,
  78. boost::hash< std::string >,
  79. StringEqualityComparer > LANG_TO_FILETYPE =
  80. boost::assign::map_list_of
  81. ( "Ant" , "ant" )
  82. ( "Asm" , "asm" )
  83. ( "Awk" , "awk" )
  84. ( "Basic" , "basic" )
  85. ( "C++" , "cpp" )
  86. ( "C#" , "cs" )
  87. ( "C" , "c" )
  88. ( "COBOL" , "cobol" )
  89. ( "DosBatch" , "dosbatch" )
  90. ( "Eiffel" , "eiffel" )
  91. ( "Erlang" , "erlang" )
  92. ( "Fortran" , "fortran" )
  93. ( "HTML" , "html" )
  94. ( "Java" , "java" )
  95. ( "JavaScript" , "javascript" )
  96. ( "Lisp" , "lisp" )
  97. ( "Lua" , "lua" )
  98. ( "Make" , "make" )
  99. ( "MatLab" , "matlab" )
  100. ( "OCaml" , "ocaml" )
  101. ( "Pascal" , "pascal" )
  102. ( "Perl" , "perl" )
  103. ( "PHP" , "php" )
  104. ( "Python" , "python" )
  105. ( "REXX" , "rexx" )
  106. ( "Ruby" , "ruby" )
  107. ( "Scheme" , "scheme" )
  108. ( "Sh" , "sh" )
  109. ( "SLang" , "slang" )
  110. ( "SML" , "sml" )
  111. ( "SQL" , "sql" )
  112. ( "Tcl" , "tcl" )
  113. ( "Tex" , "tex" )
  114. ( "Vera" , "vera" )
  115. ( "Verilog" , "verilog" )
  116. ( "VHDL" , "vhdl" )
  117. ( "Vim" , "vim" )
  118. ( "YACC" , "yacc" );
// Sentinel passed to FindWithDefault() as the fallback value when looking up a
// language in LANG_TO_FILETYPE; a result equal to it means the language has no
// entry in the map.
const char *const NOT_FOUND = "YCMFOOBAR_NOT_FOUND";
  120. } // unnamed namespace
  121. std::string RemoveIdentifierFreeText( std::string text ) {
  122. boost::erase_all_regex( text, boost::regex( COMMENT_AND_STRING_REGEX ) );
  123. return text;
  124. }
  125. std::vector< std::string > ExtractIdentifiersFromText(
  126. const std::string &text ) {
  127. std::string::const_iterator start = text.begin();
  128. std::string::const_iterator end = text.end();
  129. boost::smatch matches;
  130. const boost::regex expression( IDENTIFIER_REGEX );
  131. std::vector< std::string > identifiers;
  132. while ( boost::regex_search( start, end, matches, expression ) ) {
  133. identifiers.push_back( matches[ 0 ] );
  134. start = matches[ 0 ].second;
  135. }
  136. return identifiers;
  137. }
  138. FiletypeIdentifierMap ExtractIdentifiersFromTagsFile(
  139. const fs::path &path_to_tag_file ) {
  140. FiletypeIdentifierMap filetype_identifier_map;
  141. std::string tags_file_contents;
  142. try {
  143. tags_file_contents = ReadUtf8File( path_to_tag_file );
  144. } catch ( ... ) {
  145. return filetype_identifier_map;
  146. }
  147. std::string::const_iterator start = tags_file_contents.begin();
  148. std::string::const_iterator end = tags_file_contents.end();
  149. boost::smatch matches;
  150. const boost::regex expression( TAG_REGEX );
  151. const boost::match_flag_type options = boost::match_not_dot_newline;
  152. while ( boost::regex_search( start, end, matches, expression, options ) ) {
  153. start = matches[ 0 ].second;
  154. std::string language( matches[ 3 ] );
  155. std::string filetype = FindWithDefault( LANG_TO_FILETYPE,
  156. language.c_str(),
  157. NOT_FOUND );
  158. if ( filetype == NOT_FOUND )
  159. continue;
  160. std::string identifier( matches[ 1 ] );
  161. fs::path path( matches[ 2 ].str() );
  162. if ( path.is_relative() )
  163. path = path_to_tag_file.parent_path() / path;
  164. filetype_identifier_map[ filetype ][ path.string() ].push_back( identifier );
  165. }
  166. return filetype_identifier_map;
  167. }
  168. } // namespace YouCompleteMe