LexJSON.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. // Scintilla source code edit control
  2. /**
  3. * @file LexJSON.cxx
  4. * @date February 19, 2016
  5. * @brief Lexer for JSON and JSON-LD formats
  6. * @author nkmathew
  7. *
  8. * The License.txt file describes the conditions under which this software may
  9. * be distributed.
  10. *
  11. */
  12. #include <cstdlib>
  13. #include <cassert>
  14. #include <cctype>
  15. #include <cstdio>
  16. #include <string>
  17. #include <vector>
  18. #include <map>
  19. #include "ILexer.h"
  20. #include "Scintilla.h"
  21. #include "SciLexer.h"
  22. #include "WordList.h"
  23. #include "LexAccessor.h"
  24. #include "StyleContext.h"
  25. #include "CharacterSet.h"
  26. #include "LexerModule.h"
  27. #include "OptionSet.h"
  28. #ifdef SCI_NAMESPACE
  29. using namespace Scintilla;
  30. #endif
  /**
   * Human-readable descriptions of the keyword lists this lexer accepts,
   * terminated by a null entry.
   *
   * The position of each description corresponds to the list index handled
   * by LexerJSON::WordListSet (0: JSON keywords, 1: JSON-LD keywords).
   */
  static const char *const JSONWordListDesc[] = {
      "JSON Keywords",
      "JSON-LD Keywords",
      0
  };
  36. /**
  37. * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the
  38. * colon separating the prefix and suffix
  39. *
  40. * https://www.w3.org/TR/json-ld/#dfn-compact-iri
  41. */
  42. struct CompactIRI {
  43. int colonCount;
  44. bool foundInvalidChar;
  45. CharacterSet setCompactIRI;
  46. CompactIRI() {
  47. colonCount = 0;
  48. foundInvalidChar = false;
  49. setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-");
  50. }
  51. void resetState() {
  52. colonCount = 0;
  53. foundInvalidChar = false;
  54. }
  55. void checkChar(int ch) {
  56. if (ch == ':') {
  57. colonCount++;
  58. } else {
  59. foundInvalidChar |= !setCompactIRI.Contains(ch);
  60. }
  61. }
  62. bool shouldHighlight() const {
  63. return !foundInvalidChar && colonCount == 1;
  64. }
  65. };
  66. /**
  67. * Keeps track of escaped characters in strings as per:
  68. *
  69. * https://tools.ietf.org/html/rfc7159#section-7
  70. */
  71. struct EscapeSequence {
  72. int digitsLeft;
  73. CharacterSet setHexDigits;
  74. CharacterSet setEscapeChars;
  75. EscapeSequence() {
  76. digitsLeft = 0;
  77. setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef");
  78. setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/");
  79. }
  80. // Returns true if the following character is a valid escaped character
  81. bool newSequence(int nextChar) {
  82. digitsLeft = 0;
  83. if (nextChar == 'u') {
  84. digitsLeft = 5;
  85. } else if (!setEscapeChars.Contains(nextChar)) {
  86. return false;
  87. }
  88. return true;
  89. }
  90. bool atEscapeEnd() const {
  91. return digitsLeft <= 0;
  92. }
  93. bool isInvalidChar(int currChar) const {
  94. return !setHexDigits.Contains(currChar);
  95. }
  96. };
  /**
   * Runtime options of the JSON lexer. Every option starts disabled and is
   * bound to a property name in OptionSetJSON.
   */
  struct OptionsJSON {
      // "fold.compact": flag lines without visible characters when folding
      bool foldCompact;
      // "fold": enables LexerJSON::Fold
      bool fold;
      // "lexer.json.allow.comments": style // and /* */ comments
      bool allowComments;
      // "lexer.json.escape.sequence": style escape sequences in strings
      bool escapeSequence;

      OptionsJSON() :
          foldCompact(false),
          fold(false),
          allowComments(false),
          escapeSequence(false) {
      }
  };
  109. struct OptionSetJSON : public OptionSet<OptionsJSON> {
  110. OptionSetJSON() {
  111. DefineProperty("lexer.json.escape.sequence", &OptionsJSON::escapeSequence,
  112. "Set to 1 to enable highlighting of escape sequences in strings");
  113. DefineProperty("lexer.json.allow.comments", &OptionsJSON::allowComments,
  114. "Set to 1 to enable highlighting of line/block comments in JSON");
  115. DefineProperty("fold.compact", &OptionsJSON::foldCompact);
  116. DefineProperty("fold", &OptionsJSON::fold);
  117. DefineWordListSets(JSONWordListDesc);
  118. }
  119. };
  120. class LexerJSON : public ILexer {
  121. OptionsJSON options;
  122. OptionSetJSON optSetJSON;
  123. EscapeSequence escapeSeq;
  124. WordList keywordsJSON;
  125. WordList keywordsJSONLD;
  126. CharacterSet setOperators;
  127. CharacterSet setURL;
  128. CharacterSet setKeywordJSONLD;
  129. CharacterSet setKeywordJSON;
  130. CompactIRI compactIRI;
  131. static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) {
  132. Sci_Position i = 0;
  133. while (i < 50) {
  134. i++;
  135. char curr = styler.SafeGetCharAt(start+i, '\0');
  136. char next = styler.SafeGetCharAt(start+i+1, '\0');
  137. bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
  138. if (curr == ch) {
  139. return true;
  140. } else if (!isspacechar(curr) || atEOL) {
  141. return false;
  142. }
  143. }
  144. return false;
  145. }
  146. /**
  147. * Looks for the colon following the end quote
  148. *
  149. * Assumes property names of lengths no longer than a 100 characters.
  150. * The colon is also expected to be less than 50 spaces after the end
  151. * quote for the string to be considered a property name
  152. */
  153. static bool AtPropertyName(LexAccessor &styler, Sci_Position start) {
  154. Sci_Position i = 0;
  155. bool escaped = false;
  156. while (i < 100) {
  157. i++;
  158. char curr = styler.SafeGetCharAt(start+i, '\0');
  159. if (escaped) {
  160. escaped = false;
  161. continue;
  162. }
  163. escaped = curr == '\\';
  164. if (curr == '"') {
  165. return IsNextNonWhitespace(styler, start+i, ':');
  166. } else if (!curr) {
  167. return false;
  168. }
  169. }
  170. return false;
  171. }
  172. static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet,
  173. StyleContext &context, LexAccessor &styler) {
  174. char word[51];
  175. Sci_Position currPos = (Sci_Position) context.currentPos;
  176. int i = 0;
  177. while (i < 50) {
  178. char ch = styler.SafeGetCharAt(currPos + i);
  179. if (!wordSet.Contains(ch)) {
  180. break;
  181. }
  182. word[i] = ch;
  183. i++;
  184. }
  185. word[i] = '\0';
  186. return keywordList.InList(word);
  187. }
  188. public:
  189. LexerJSON() :
  190. setOperators(CharacterSet::setNone, "[{}]:,"),
  191. setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
  192. setKeywordJSONLD(CharacterSet::setAlpha, ":@"),
  193. setKeywordJSON(CharacterSet::setAlpha, "$_") {
  194. }
  195. virtual ~LexerJSON() {}
  196. virtual int SCI_METHOD Version() const {
  197. return lvOriginal;
  198. }
  199. virtual void SCI_METHOD Release() {
  200. delete this;
  201. }
  202. virtual const char *SCI_METHOD PropertyNames() {
  203. return optSetJSON.PropertyNames();
  204. }
  205. virtual int SCI_METHOD PropertyType(const char *name) {
  206. return optSetJSON.PropertyType(name);
  207. }
  208. virtual const char *SCI_METHOD DescribeProperty(const char *name) {
  209. return optSetJSON.DescribeProperty(name);
  210. }
  211. virtual Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) {
  212. if (optSetJSON.PropertySet(&options, key, val)) {
  213. return 0;
  214. }
  215. return -1;
  216. }
  217. virtual Sci_Position SCI_METHOD WordListSet(int n, const char *wl) {
  218. WordList *wordListN = 0;
  219. switch (n) {
  220. case 0:
  221. wordListN = &keywordsJSON;
  222. break;
  223. case 1:
  224. wordListN = &keywordsJSONLD;
  225. break;
  226. }
  227. Sci_Position firstModification = -1;
  228. if (wordListN) {
  229. WordList wlNew;
  230. wlNew.Set(wl);
  231. if (*wordListN != wlNew) {
  232. wordListN->Set(wl);
  233. firstModification = 0;
  234. }
  235. }
  236. return firstModification;
  237. }
  238. virtual void *SCI_METHOD PrivateCall(int, void *) {
  239. return 0;
  240. }
  241. static ILexer *LexerFactoryJSON() {
  242. return new LexerJSON;
  243. }
  244. virtual const char *SCI_METHOD DescribeWordListSets() {
  245. return optSetJSON.DescribeWordListSets();
  246. }
  247. virtual void SCI_METHOD Lex(Sci_PositionU startPos,
  248. Sci_Position length,
  249. int initStyle,
  250. IDocument *pAccess);
  251. virtual void SCI_METHOD Fold(Sci_PositionU startPos,
  252. Sci_Position length,
  253. int initStyle,
  254. IDocument *pAccess);
  255. };
  256. void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos,
  257. Sci_Position length,
  258. int initStyle,
  259. IDocument *pAccess) {
  260. LexAccessor styler(pAccess);
  261. StyleContext context(startPos, length, initStyle, styler);
  262. int stringStyleBefore = SCE_JSON_STRING;
  263. while (context.More()) {
  264. switch (context.state) {
  265. case SCE_JSON_BLOCKCOMMENT:
  266. if (context.Match("*/")) {
  267. context.Forward();
  268. context.ForwardSetState(SCE_JSON_DEFAULT);
  269. }
  270. break;
  271. case SCE_JSON_LINECOMMENT:
  272. if (context.atLineEnd) {
  273. context.SetState(SCE_JSON_DEFAULT);
  274. }
  275. break;
  276. case SCE_JSON_STRINGEOL:
  277. if (context.atLineStart) {
  278. context.SetState(SCE_JSON_DEFAULT);
  279. }
  280. break;
  281. case SCE_JSON_ESCAPESEQUENCE:
  282. escapeSeq.digitsLeft--;
  283. if (!escapeSeq.atEscapeEnd()) {
  284. if (escapeSeq.isInvalidChar(context.ch)) {
  285. context.SetState(SCE_JSON_ERROR);
  286. }
  287. break;
  288. }
  289. if (context.ch == '"') {
  290. context.SetState(stringStyleBefore);
  291. context.ForwardSetState(SCE_C_DEFAULT);
  292. } else if (context.ch == '\\') {
  293. if (!escapeSeq.newSequence(context.chNext)) {
  294. context.SetState(SCE_JSON_ERROR);
  295. }
  296. context.Forward();
  297. } else {
  298. context.SetState(stringStyleBefore);
  299. if (context.atLineEnd) {
  300. context.ChangeState(SCE_JSON_STRINGEOL);
  301. }
  302. }
  303. break;
  304. case SCE_JSON_PROPERTYNAME:
  305. case SCE_JSON_STRING:
  306. if (context.ch == '"') {
  307. if (compactIRI.shouldHighlight()) {
  308. context.ChangeState(SCE_JSON_COMPACTIRI);
  309. context.ForwardSetState(SCE_JSON_DEFAULT);
  310. compactIRI.resetState();
  311. } else {
  312. context.ForwardSetState(SCE_JSON_DEFAULT);
  313. }
  314. } else if (context.atLineEnd) {
  315. context.ChangeState(SCE_JSON_STRINGEOL);
  316. } else if (context.ch == '\\') {
  317. stringStyleBefore = context.state;
  318. if (options.escapeSequence) {
  319. context.SetState(SCE_JSON_ESCAPESEQUENCE);
  320. if (!escapeSeq.newSequence(context.chNext)) {
  321. context.SetState(SCE_JSON_ERROR);
  322. }
  323. }
  324. context.Forward();
  325. } else if (context.Match("https://") ||
  326. context.Match("http://") ||
  327. context.Match("ssh://") ||
  328. context.Match("git://") ||
  329. context.Match("svn://") ||
  330. context.Match("ftp://") ||
  331. context.Match("mailto:")) {
  332. // Handle most common URI schemes only
  333. stringStyleBefore = context.state;
  334. context.SetState(SCE_JSON_URI);
  335. } else if (context.ch == '@') {
  336. // https://www.w3.org/TR/json-ld/#dfn-keyword
  337. if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) {
  338. stringStyleBefore = context.state;
  339. context.SetState(SCE_JSON_LDKEYWORD);
  340. }
  341. } else {
  342. compactIRI.checkChar(context.ch);
  343. }
  344. break;
  345. case SCE_JSON_LDKEYWORD:
  346. case SCE_JSON_URI:
  347. if ((!setKeywordJSONLD.Contains(context.ch) &&
  348. (context.state == SCE_JSON_LDKEYWORD)) ||
  349. (!setURL.Contains(context.ch))) {
  350. context.SetState(stringStyleBefore);
  351. }
  352. if (context.ch == '"') {
  353. context.ForwardSetState(SCE_JSON_DEFAULT);
  354. } else if (context.atLineEnd) {
  355. context.ChangeState(SCE_JSON_STRINGEOL);
  356. }
  357. break;
  358. case SCE_JSON_OPERATOR:
  359. case SCE_JSON_NUMBER:
  360. context.SetState(SCE_JSON_DEFAULT);
  361. break;
  362. case SCE_JSON_ERROR:
  363. if (context.atLineEnd) {
  364. context.SetState(SCE_JSON_DEFAULT);
  365. }
  366. break;
  367. case SCE_JSON_KEYWORD:
  368. if (!setKeywordJSON.Contains(context.ch)) {
  369. context.SetState(SCE_JSON_DEFAULT);
  370. }
  371. break;
  372. }
  373. if (context.state == SCE_JSON_DEFAULT) {
  374. if (context.ch == '"') {
  375. compactIRI.resetState();
  376. context.SetState(SCE_JSON_STRING);
  377. Sci_Position currPos = static_cast<Sci_Position>(context.currentPos);
  378. if (AtPropertyName(styler, currPos)) {
  379. context.SetState(SCE_JSON_PROPERTYNAME);
  380. }
  381. } else if (setOperators.Contains(context.ch)) {
  382. context.SetState(SCE_JSON_OPERATOR);
  383. } else if (options.allowComments && context.Match("/*")) {
  384. context.SetState(SCE_JSON_BLOCKCOMMENT);
  385. context.Forward();
  386. } else if (options.allowComments && context.Match("//")) {
  387. context.SetState(SCE_JSON_LINECOMMENT);
  388. } else if (setKeywordJSON.Contains(context.ch)) {
  389. if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) {
  390. context.SetState(SCE_JSON_KEYWORD);
  391. }
  392. }
  393. bool numberStart =
  394. IsADigit(context.ch) && (context.chPrev == '+'||
  395. context.chPrev == '-' ||
  396. context.atLineStart ||
  397. IsASpace(context.chPrev) ||
  398. setOperators.Contains(context.chPrev));
  399. bool exponentPart =
  400. tolower(context.ch) == 'e' &&
  401. IsADigit(context.chPrev) &&
  402. (IsADigit(context.chNext) ||
  403. context.chNext == '+' ||
  404. context.chNext == '-');
  405. bool signPart =
  406. (context.ch == '-' || context.ch == '+') &&
  407. ((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) ||
  408. ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev))
  409. && IsADigit(context.chNext)));
  410. bool adjacentDigit =
  411. IsADigit(context.ch) && IsADigit(context.chPrev);
  412. bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e';
  413. bool dotPart = context.ch == '.' &&
  414. IsADigit(context.chPrev) &&
  415. IsADigit(context.chNext);
  416. bool afterDot = IsADigit(context.ch) && context.chPrev == '.';
  417. if (numberStart ||
  418. exponentPart ||
  419. signPart ||
  420. adjacentDigit ||
  421. dotPart ||
  422. afterExponent ||
  423. afterDot) {
  424. context.SetState(SCE_JSON_NUMBER);
  425. } else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) {
  426. context.SetState(SCE_JSON_ERROR);
  427. }
  428. }
  429. context.Forward();
  430. }
  431. context.Complete();
  432. }
  433. void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos,
  434. Sci_Position length,
  435. int,
  436. IDocument *pAccess) {
  437. if (!options.fold) {
  438. return;
  439. }
  440. LexAccessor styler(pAccess);
  441. Sci_PositionU currLine = styler.GetLine(startPos);
  442. Sci_PositionU endPos = startPos + length;
  443. int currLevel = SC_FOLDLEVELBASE;
  444. if (currLine > 0)
  445. currLevel = styler.LevelAt(currLine - 1) >> 16;
  446. int nextLevel = currLevel;
  447. int visibleChars = 0;
  448. for (Sci_PositionU i = startPos; i < endPos; i++) {
  449. char curr = styler.SafeGetCharAt(i);
  450. char next = styler.SafeGetCharAt(i+1);
  451. bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
  452. if (styler.StyleAt(i) == SCE_JSON_OPERATOR) {
  453. if (curr == '{' || curr == '[') {
  454. nextLevel++;
  455. } else if (curr == '}' || curr == ']') {
  456. nextLevel--;
  457. }
  458. }
  459. if (atEOL || i == (endPos-1)) {
  460. int level = currLevel | nextLevel << 16;
  461. if (!visibleChars && options.foldCompact) {
  462. level |= SC_FOLDLEVELWHITEFLAG;
  463. } else if (nextLevel > currLevel) {
  464. level |= SC_FOLDLEVELHEADERFLAG;
  465. }
  466. if (level != styler.LevelAt(currLine)) {
  467. styler.SetLevel(currLine, level);
  468. }
  469. currLine++;
  470. currLevel = nextLevel;
  471. visibleChars = 0;
  472. }
  473. if (!isspacechar(curr)) {
  474. visibleChars++;
  475. }
  476. }
  477. }
  478. LexerModule lmJSON(SCLEX_JSON,
  479. LexerJSON::LexerFactoryJSON,
  480. "json",
  481. JSONWordListDesc);