WordList.cpp 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. // Scintilla source code edit control
  2. /** @file WordList.cxx
  3. ** Hold a list of words.
  4. **/
  5. // Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
  6. // The License.txt file describes the conditions under which this software may be distributed.
  7. #include <stdlib.h>
  8. #include <string.h>
  9. #include <stdio.h>
  10. #include <stdarg.h>
  11. #include <ctype.h>
  12. #include <algorithm>
  13. #include "StringCopy.h"
  14. #include "WordList.h"
  15. #ifdef SCI_NAMESPACE
  16. using namespace Scintilla;
  17. #endif
  18. /**
  19. * Creates an array that points into each word in the string and puts \0 terminators
  20. * after each word.
  21. */
  22. static char **ArrayFromWordList(char *wordlist, int *len, bool onlyLineEnds = false) {
  23. int prev = '\n';
  24. int words = 0;
  25. // For rapid determination of whether a character is a separator, build
  26. // a look up table.
  27. bool wordSeparator[256];
  28. for (int i=0; i<256; i++) {
  29. wordSeparator[i] = false;
  30. }
  31. wordSeparator[static_cast<unsigned int>('\r')] = true;
  32. wordSeparator[static_cast<unsigned int>('\n')] = true;
  33. if (!onlyLineEnds) {
  34. wordSeparator[static_cast<unsigned int>(' ')] = true;
  35. wordSeparator[static_cast<unsigned int>('\t')] = true;
  36. }
  37. for (int j = 0; wordlist[j]; j++) {
  38. int curr = static_cast<unsigned char>(wordlist[j]);
  39. if (!wordSeparator[curr] && wordSeparator[prev])
  40. words++;
  41. prev = curr;
  42. }
  43. char **keywords = new char *[words + 1];
  44. int wordsStore = 0;
  45. const size_t slen = strlen(wordlist);
  46. if (words) {
  47. prev = '\0';
  48. for (size_t k = 0; k < slen; k++) {
  49. if (!wordSeparator[static_cast<unsigned char>(wordlist[k])]) {
  50. if (!prev) {
  51. keywords[wordsStore] = &wordlist[k];
  52. wordsStore++;
  53. }
  54. } else {
  55. wordlist[k] = '\0';
  56. }
  57. prev = wordlist[k];
  58. }
  59. }
  60. keywords[wordsStore] = &wordlist[slen];
  61. *len = wordsStore;
  62. return keywords;
  63. }
  64. WordList::WordList(bool onlyLineEnds_) :
  65. words(0), list(0), len(0), onlyLineEnds(onlyLineEnds_) {
  66. // Prevent warnings by static analyzers about uninitialized starts.
  67. starts[0] = -1;
  68. }
  69. WordList::~WordList() {
  70. Clear();
  71. }
  72. WordList::operator bool() const {
  73. return len ? true : false;
  74. }
  75. bool WordList::operator!=(const WordList &other) const {
  76. if (len != other.len)
  77. return true;
  78. for (int i=0; i<len; i++) {
  79. if (strcmp(words[i], other.words[i]) != 0)
  80. return true;
  81. }
  82. return false;
  83. }
  84. int WordList::Length() const {
  85. return len;
  86. }
  87. void WordList::Clear() {
  88. if (words) {
  89. delete []list;
  90. delete []words;
  91. }
  92. words = 0;
  93. list = 0;
  94. len = 0;
  95. }
  96. #ifdef _MSC_VER
  97. static bool cmpWords(const char *a, const char *b) {
  98. return strcmp(a, b) < 0;
  99. }
  100. #else
  101. static int cmpWords(const void *a, const void *b) {
  102. return strcmp(*static_cast<const char * const *>(a), *static_cast<const char * const *>(b));
  103. }
  104. static void SortWordList(char **words, unsigned int len) {
  105. qsort(reinterpret_cast<void *>(words), len, sizeof(*words), cmpWords);
  106. }
  107. #endif
  108. void WordList::Set(const char *s) {
  109. Clear();
  110. const size_t lenS = strlen(s) + 1;
  111. list = new char[lenS];
  112. memcpy(list, s, lenS);
  113. words = ArrayFromWordList(list, &len, onlyLineEnds);
  114. #ifdef _MSC_VER
  115. std::sort(words, words + len, cmpWords);
  116. #else
  117. SortWordList(words, len);
  118. #endif
  119. for (unsigned int k = 0; k < ELEMENTS(starts); k++)
  120. starts[k] = -1;
  121. for (int l = len - 1; l >= 0; l--) {
  122. unsigned char indexChar = words[l][0];
  123. starts[indexChar] = l;
  124. }
  125. }
  126. /** Check whether a string is in the list.
  127. * List elements are either exact matches or prefixes.
  128. * Prefix elements start with '^' and match all strings that start with the rest of the element
  129. * so '^GTK_' matches 'GTK_X', 'GTK_MAJOR_VERSION', and 'GTK_'.
  130. */
  131. bool WordList::InList(const char *s) const {
  132. if (0 == words)
  133. return false;
  134. unsigned char firstChar = s[0];
  135. int j = starts[firstChar];
  136. if (j >= 0) {
  137. while (static_cast<unsigned char>(words[j][0]) == firstChar) {
  138. if (s[1] == words[j][1]) {
  139. const char *a = words[j] + 1;
  140. const char *b = s + 1;
  141. while (*a && *a == *b) {
  142. a++;
  143. b++;
  144. }
  145. if (!*a && !*b)
  146. return true;
  147. }
  148. j++;
  149. }
  150. }
  151. j = starts[static_cast<unsigned int>('^')];
  152. if (j >= 0) {
  153. while (words[j][0] == '^') {
  154. const char *a = words[j] + 1;
  155. const char *b = s;
  156. while (*a && *a == *b) {
  157. a++;
  158. b++;
  159. }
  160. if (!*a)
  161. return true;
  162. j++;
  163. }
  164. }
  165. return false;
  166. }
  167. /** similar to InList, but word s can be a substring of keyword.
  168. * eg. the keyword define is defined as def~ine. This means the word must start
  169. * with def to be a keyword, but also defi, defin and define are valid.
  170. * The marker is ~ in this case.
  171. */
  172. bool WordList::InListAbbreviated(const char *s, const char marker) const {
  173. if (0 == words)
  174. return false;
  175. unsigned char firstChar = s[0];
  176. int j = starts[firstChar];
  177. if (j >= 0) {
  178. while (static_cast<unsigned char>(words[j][0]) == firstChar) {
  179. bool isSubword = false;
  180. int start = 1;
  181. if (words[j][1] == marker) {
  182. isSubword = true;
  183. start++;
  184. }
  185. if (s[1] == words[j][start]) {
  186. const char *a = words[j] + start;
  187. const char *b = s + 1;
  188. while (*a && *a == *b) {
  189. a++;
  190. if (*a == marker) {
  191. isSubword = true;
  192. a++;
  193. }
  194. b++;
  195. }
  196. if ((!*a || isSubword) && !*b)
  197. return true;
  198. }
  199. j++;
  200. }
  201. }
  202. j = starts[static_cast<unsigned int>('^')];
  203. if (j >= 0) {
  204. while (words[j][0] == '^') {
  205. const char *a = words[j] + 1;
  206. const char *b = s;
  207. while (*a && *a == *b) {
  208. a++;
  209. b++;
  210. }
  211. if (!*a)
  212. return true;
  213. j++;
  214. }
  215. }
  216. return false;
  217. }
  218. /** similar to InListAbbreviated, but word s can be a abridged version of a keyword.
  219. * eg. the keyword is defined as "after.~:". This means the word must have a prefix (begins with) of
  220. * "after." and suffix (ends with) of ":" to be a keyword, Hence "after.field:" , "after.form.item:" are valid.
  221. * Similarly "~.is.valid" keyword is suffix only... hence "field.is.valid" , "form.is.valid" are valid.
  222. * The marker is ~ in this case.
  223. * No multiple markers check is done and wont work.
  224. */
  225. bool WordList::InListAbridged(const char *s, const char marker) const {
  226. if (0 == words)
  227. return false;
  228. unsigned char firstChar = s[0];
  229. int j = starts[firstChar];
  230. if (j >= 0) {
  231. while (static_cast<unsigned char>(words[j][0]) == firstChar) {
  232. const char *a = words[j];
  233. const char *b = s;
  234. while (*a && *a == *b) {
  235. a++;
  236. if (*a == marker) {
  237. a++;
  238. const size_t suffixLengthA = strlen(a);
  239. const size_t suffixLengthB = strlen(b);
  240. if (suffixLengthA >= suffixLengthB)
  241. break;
  242. b = b + suffixLengthB - suffixLengthA - 1;
  243. }
  244. b++;
  245. }
  246. if (!*a && !*b)
  247. return true;
  248. j++;
  249. }
  250. }
  251. j = starts[static_cast<unsigned int>(marker)];
  252. if (j >= 0) {
  253. while (words[j][0] == marker) {
  254. const char *a = words[j] + 1;
  255. const char *b = s;
  256. const size_t suffixLengthA = strlen(a);
  257. const size_t suffixLengthB = strlen(b);
  258. if (suffixLengthA > suffixLengthB) {
  259. j++;
  260. continue;
  261. }
  262. b = b + suffixLengthB - suffixLengthA;
  263. while (*a && *a == *b) {
  264. a++;
  265. b++;
  266. }
  267. if (!*a && !*b)
  268. return true;
  269. j++;
  270. }
  271. }
  272. return false;
  273. }
  274. const char *WordList::WordAt(int n) const {
  275. return words[n];
  276. }