LexMarkdown.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. /******************************************************************
  2. * LexMarkdown.cxx
  3. *
  4. * A simple Markdown lexer for scintilla.
  5. *
  6. * Includes highlighting for some extra features from the
  7. * Pandoc implementation; strikeout, using '#.' as a default
  8. * ordered list item marker, and delimited code blocks.
  9. *
  10. * Limitations:
  11. *
  12. * Standard indented code blocks are not highlighted at all,
  13. * as it would conflict with other indentation schemes. Use
  14. * delimited code blocks for blanket highlighting of an
  15. * entire code block. Embedded HTML is not highlighted either.
  16. * Blanket HTML highlighting has issues, because some Markdown
  17. * implementations allow Markdown markup inside of the HTML. Also,
  18. * there is a following blank line issue that can't be ignored,
  19. * explained in the next paragraph. Embedded HTML and code
  20. * blocks would be better supported with language specific
  21. * highlighting.
  22. *
  23. * The highlighting aims to accurately reflect correct syntax,
  24. * but a few restrictions are relaxed. Delimited code blocks are
  25. * highlighted, even if the line following the code block is not blank.
  26. * Requiring a blank line after a block, breaks the highlighting
  27. * in certain cases, because of the way Scintilla ends up calling
  28. * the lexer.
  29. *
  30. * Written by Jon Strait - jstrait@moonloop.net
  31. *
  32. * The License.txt file describes the conditions under which this
  33. * software may be distributed.
  34. *
  35. *****************************************************************/
  36. #include <stdlib.h>
  37. #include <string.h>
  38. #include <stdio.h>
  39. #include <stdarg.h>
  40. #include <assert.h>
  41. #include "ILexer.h"
  42. #include "Scintilla.h"
  43. #include "SciLexer.h"
  44. #include "WordList.h"
  45. #include "LexAccessor.h"
  46. #include "Accessor.h"
  47. #include "StyleContext.h"
  48. #include "CharacterSet.h"
  49. #include "LexerModule.h"
  50. #ifdef SCI_NAMESPACE
  51. using namespace Scintilla;
  52. #endif
  // True when ch is one of the two line-terminator characters (LF or CR).
  static inline bool IsNewline(const int ch) {
      switch (ch) {
      case '\n':
      case '\r':
          return true;
      default:
          return false;
      }
  }
  56. // True if can follow ch down to the end with possibly trailing whitespace
  57. static bool FollowToLineEnd(const int ch, const int state, const Sci_PositionU endPos, StyleContext &sc) {
  58. Sci_PositionU i = 0;
  59. while (sc.GetRelative(++i) == ch)
  60. ;
  61. // Skip over whitespace
  62. while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos)
  63. ++i;
  64. if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) {
  65. sc.Forward(i);
  66. sc.ChangeState(state);
  67. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  68. return true;
  69. }
  70. else return false;
  71. }
  72. // Set the state on text section from current to length characters,
  73. // then set the rest until the newline to default, except for any characters matching token
  74. static void SetStateAndZoom(const int state, const Sci_Position length, const int token, StyleContext &sc) {
  75. sc.SetState(state);
  76. sc.Forward(length);
  77. sc.SetState(SCE_MARKDOWN_DEFAULT);
  78. sc.Forward();
  79. bool started = false;
  80. while (sc.More() && !IsNewline(sc.ch)) {
  81. if (sc.ch == token && !started) {
  82. sc.SetState(state);
  83. started = true;
  84. }
  85. else if (sc.ch != token) {
  86. sc.SetState(SCE_MARKDOWN_DEFAULT);
  87. started = false;
  88. }
  89. sc.Forward();
  90. }
  91. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  92. }
  93. // Does the previous line have more than spaces and tabs?
  94. static bool HasPrevLineContent(StyleContext &sc) {
  95. Sci_Position i = 0;
  96. // Go back to the previous newline
  97. while ((--i + (Sci_Position)sc.currentPos) >= 0 && !IsNewline(sc.GetRelative(i)))
  98. ;
  99. while ((--i + (Sci_Position)sc.currentPos) >= 0) {
  100. if (IsNewline(sc.GetRelative(i)))
  101. break;
  102. if (!IsASpaceOrTab(sc.GetRelative(i)))
  103. return true;
  104. }
  105. return false;
  106. }
  107. static bool AtTermStart(StyleContext &sc) {
  108. return sc.currentPos == 0 || sc.chPrev == 0 || isspacechar(sc.chPrev);
  109. }
  110. static bool IsValidHrule(const Sci_PositionU endPos, StyleContext &sc) {
  111. int count = 1;
  112. Sci_PositionU i = 0;
  113. for (;;) {
  114. ++i;
  115. int c = sc.GetRelative(i);
  116. if (c == sc.ch)
  117. ++count;
  118. // hit a terminating character
  119. else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) {
  120. // Are we a valid HRULE
  121. if ((IsNewline(c) || sc.currentPos + i == endPos) &&
  122. count >= 3 && !HasPrevLineContent(sc)) {
  123. sc.SetState(SCE_MARKDOWN_HRULE);
  124. sc.Forward(i);
  125. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  126. return true;
  127. }
  128. else {
  129. sc.SetState(SCE_MARKDOWN_DEFAULT);
  130. return false;
  131. }
  132. }
  133. }
  134. }
  135. static void ColorizeMarkdownDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
  136. WordList **, Accessor &styler) {
  137. Sci_PositionU endPos = startPos + length;
  138. int precharCount = 0;
  139. // Don't advance on a new loop iteration and retry at the same position.
  140. // Useful in the corner case of having to start at the beginning file position
  141. // in the default state.
  142. bool freezeCursor = false;
  143. StyleContext sc(startPos, length, initStyle, styler);
  144. while (sc.More()) {
  145. // Skip past escaped characters
  146. if (sc.ch == '\\') {
  147. sc.Forward();
  148. continue;
  149. }
  150. // A blockquotes resets the line semantics
  151. if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
  152. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  153. // Conditional state-based actions
  154. if (sc.state == SCE_MARKDOWN_CODE2) {
  155. if (sc.Match("``") && sc.GetRelative(-2) != ' ') {
  156. sc.Forward(2);
  157. sc.SetState(SCE_MARKDOWN_DEFAULT);
  158. }
  159. }
  160. else if (sc.state == SCE_MARKDOWN_CODE) {
  161. if (sc.ch == '`' && sc.chPrev != ' ')
  162. sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
  163. }
  164. /* De-activated because it gets in the way of other valid indentation
  165. * schemes, for example multiple paragraphs inside a list item.
  166. // Code block
  167. else if (sc.state == SCE_MARKDOWN_CODEBK) {
  168. bool d = true;
  169. if (IsNewline(sc.ch)) {
  170. if (sc.chNext != '\t') {
  171. for (int c = 1; c < 5; ++c) {
  172. if (sc.GetRelative(c) != ' ')
  173. d = false;
  174. }
  175. }
  176. }
  177. else if (sc.atLineStart) {
  178. if (sc.ch != '\t' ) {
  179. for (int i = 0; i < 4; ++i) {
  180. if (sc.GetRelative(i) != ' ')
  181. d = false;
  182. }
  183. }
  184. }
  185. if (!d)
  186. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  187. }
  188. */
  189. // Strong
  190. else if (sc.state == SCE_MARKDOWN_STRONG1) {
  191. if (sc.Match("**") && sc.chPrev != ' ') {
  192. sc.Forward(2);
  193. sc.SetState(SCE_MARKDOWN_DEFAULT);
  194. }
  195. }
  196. else if (sc.state == SCE_MARKDOWN_STRONG2) {
  197. if (sc.Match("__") && sc.chPrev != ' ') {
  198. sc.Forward(2);
  199. sc.SetState(SCE_MARKDOWN_DEFAULT);
  200. }
  201. }
  202. // Emphasis
  203. else if (sc.state == SCE_MARKDOWN_EM1) {
  204. if (sc.ch == '*' && sc.chPrev != ' ')
  205. sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
  206. }
  207. else if (sc.state == SCE_MARKDOWN_EM2) {
  208. if (sc.ch == '_' && sc.chPrev != ' ')
  209. sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
  210. }
  211. else if (sc.state == SCE_MARKDOWN_CODEBK) {
  212. if (sc.atLineStart && sc.Match("~~~")) {
  213. Sci_Position i = 1;
  214. while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
  215. i++;
  216. sc.Forward(i);
  217. sc.SetState(SCE_MARKDOWN_DEFAULT);
  218. }
  219. }
  220. else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
  221. if (sc.Match("~~") && sc.chPrev != ' ') {
  222. sc.Forward(2);
  223. sc.SetState(SCE_MARKDOWN_DEFAULT);
  224. }
  225. }
  226. else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
  227. // Header
  228. if (sc.Match("######"))
  229. SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
  230. else if (sc.Match("#####"))
  231. SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
  232. else if (sc.Match("####"))
  233. SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
  234. else if (sc.Match("###"))
  235. SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
  236. else if (sc.Match("##"))
  237. SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
  238. else if (sc.Match("#")) {
  239. // Catch the special case of an unordered list
  240. if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
  241. precharCount = 0;
  242. sc.SetState(SCE_MARKDOWN_PRECHAR);
  243. }
  244. else
  245. SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
  246. }
  247. // Code block
  248. else if (sc.Match("~~~")) {
  249. if (!HasPrevLineContent(sc))
  250. sc.SetState(SCE_MARKDOWN_CODEBK);
  251. else
  252. sc.SetState(SCE_MARKDOWN_DEFAULT);
  253. }
  254. else if (sc.ch == '=') {
  255. if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc))
  256. ;
  257. else
  258. sc.SetState(SCE_MARKDOWN_DEFAULT);
  259. }
  260. else if (sc.ch == '-') {
  261. if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc))
  262. ;
  263. else {
  264. precharCount = 0;
  265. sc.SetState(SCE_MARKDOWN_PRECHAR);
  266. }
  267. }
  268. else if (IsNewline(sc.ch))
  269. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  270. else {
  271. precharCount = 0;
  272. sc.SetState(SCE_MARKDOWN_PRECHAR);
  273. }
  274. }
  275. // The header lasts until the newline
  276. else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
  277. sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
  278. sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
  279. if (IsNewline(sc.ch))
  280. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  281. }
  282. // New state only within the initial whitespace
  283. if (sc.state == SCE_MARKDOWN_PRECHAR) {
  284. // Blockquote
  285. if (sc.ch == '>' && precharCount < 5)
  286. sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
  287. /*
  288. // Begin of code block
  289. else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
  290. sc.SetState(SCE_MARKDOWN_CODEBK);
  291. */
  292. // HRule - Total of three or more hyphens, asterisks, or underscores
  293. // on a line by themselves
  294. else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
  295. ;
  296. // Unordered list
  297. else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
  298. sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
  299. sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
  300. }
  301. // Ordered list
  302. else if (IsADigit(sc.ch)) {
  303. int digitCount = 0;
  304. while (IsADigit(sc.GetRelative(++digitCount)))
  305. ;
  306. if (sc.GetRelative(digitCount) == '.' &&
  307. IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
  308. sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
  309. sc.Forward(digitCount + 1);
  310. sc.SetState(SCE_MARKDOWN_DEFAULT);
  311. }
  312. }
  313. // Alternate Ordered list
  314. else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
  315. sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
  316. sc.Forward(2);
  317. sc.SetState(SCE_MARKDOWN_DEFAULT);
  318. }
  319. else if (sc.ch != ' ' || precharCount > 2)
  320. sc.SetState(SCE_MARKDOWN_DEFAULT);
  321. else
  322. ++precharCount;
  323. }
  324. // New state anywhere in doc
  325. if (sc.state == SCE_MARKDOWN_DEFAULT) {
  326. if (sc.atLineStart && sc.ch == '#') {
  327. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  328. freezeCursor = true;
  329. }
  330. // Links and Images
  331. if (sc.Match("![") || sc.ch == '[') {
  332. Sci_Position i = 0, j = 0, k = 0;
  333. Sci_Position len = endPos - sc.currentPos;
  334. while (i < len && (sc.GetRelative(++i) != ']' || sc.GetRelative(i - 1) == '\\'))
  335. ;
  336. if (sc.GetRelative(i) == ']') {
  337. j = i;
  338. if (sc.GetRelative(++i) == '(') {
  339. while (i < len && (sc.GetRelative(++i) != ')' || sc.GetRelative(i - 1) == '\\'))
  340. ;
  341. if (sc.GetRelative(i) == ')')
  342. k = i;
  343. }
  344. else if (sc.GetRelative(i) == '[' || sc.GetRelative(++i) == '[') {
  345. while (i < len && (sc.GetRelative(++i) != ']' || sc.GetRelative(i - 1) == '\\'))
  346. ;
  347. if (sc.GetRelative(i) == ']')
  348. k = i;
  349. }
  350. }
  351. // At least a link text
  352. if (j) {
  353. sc.SetState(SCE_MARKDOWN_LINK);
  354. sc.Forward(j);
  355. // Also has a URL or reference portion
  356. if (k)
  357. sc.Forward(k - j);
  358. sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
  359. }
  360. }
  361. // Code - also a special case for alternate inside spacing
  362. if (sc.Match("``") && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
  363. sc.SetState(SCE_MARKDOWN_CODE2);
  364. sc.Forward();
  365. }
  366. else if (sc.ch == '`' && sc.chNext != ' ' && AtTermStart(sc)) {
  367. sc.SetState(SCE_MARKDOWN_CODE);
  368. }
  369. // Strong
  370. else if (sc.Match("**") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
  371. sc.SetState(SCE_MARKDOWN_STRONG1);
  372. sc.Forward();
  373. }
  374. else if (sc.Match("__") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
  375. sc.SetState(SCE_MARKDOWN_STRONG2);
  376. sc.Forward();
  377. }
  378. // Emphasis
  379. else if (sc.ch == '*' && sc.chNext != ' ' && AtTermStart(sc)) {
  380. sc.SetState(SCE_MARKDOWN_EM1);
  381. }
  382. else if (sc.ch == '_' && sc.chNext != ' ' && AtTermStart(sc)) {
  383. sc.SetState(SCE_MARKDOWN_EM2);
  384. }
  385. // Strikeout
  386. else if (sc.Match("~~") && sc.GetRelative(2) != ' ' && AtTermStart(sc)) {
  387. sc.SetState(SCE_MARKDOWN_STRIKEOUT);
  388. sc.Forward();
  389. }
  390. // Beginning of line
  391. else if (IsNewline(sc.ch)) {
  392. sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
  393. }
  394. }
  395. // Advance if not holding back the cursor for this iteration.
  396. if (!freezeCursor)
  397. sc.Forward();
  398. freezeCursor = false;
  399. }
  400. sc.Complete();
  401. }
  402. LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown");