cf-lexer.h 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. /*
  2. * Copyright (c) 2023 Lain Bailey <[email protected]>
  3. *
  4. * Permission to use, copy, modify, and distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. #pragma once
  17. #include "lexer.h"
  18. #ifdef __cplusplus
  19. extern "C" {
  20. #endif
  21. EXPORT char *cf_literal_to_str(const char *literal, size_t count);
  22. /* ------------------------------------------------------------------------- */
  23. /*
  24. * A C-family lexer token is defined as:
  25. * 1.) A generic 'name' token. (abc123_def456)
  26. * 2.) A numeric sequence (usually starting with a number)
  27. * 3.) A sequence of generic whitespace defined as spaces and tabs
  28. * 4.) A newline
  29. * 5.) A string or character sequence (surrounded by single or double quotes)
  30. * 6.) A single character of a type not specified above
  31. */
  /*
   * Category assigned to each C-family lexer token.
   *
   * CFTOKEN_NONE is pinned to zero so that a zero-filled cf_token (see
   * cf_token_clear) carries it.
   */
  enum cf_token_type {
      CFTOKEN_NONE = 0,
      CFTOKEN_NAME,     /* generic 'name' token, e.g. abc123_def456 */
      CFTOKEN_NUM,      /* numeric sequence */
      CFTOKEN_SPACETAB, /* run of spaces and/or tabs */
      CFTOKEN_NEWLINE,  /* a newline */
      CFTOKEN_STRING,   /* single- or double-quoted sequence */
      CFTOKEN_OTHER     /* any single character not covered above */
  };
  41. struct cf_token {
  42. const struct cf_lexer *lex;
  43. struct strref str;
  44. struct strref unmerged_str;
  45. enum cf_token_type type;
  46. };
  47. typedef DARRAY(struct cf_token) cf_token_array_t;
  48. static inline void cf_token_clear(struct cf_token *t)
  49. {
  50. memset(t, 0, sizeof(struct cf_token));
  51. }
  52. static inline void cf_token_copy(struct cf_token *dst,
  53. const struct cf_token *src)
  54. {
  55. memcpy(dst, src, sizeof(struct cf_token));
  56. }
  57. static inline void cf_token_add(struct cf_token *dst,
  58. const struct cf_token *add)
  59. {
  60. strref_add(&dst->str, &add->str);
  61. strref_add(&dst->unmerged_str, &add->unmerged_str);
  62. }
  63. /* ------------------------------------------------------------------------- */
  64. /*
  65. * The c-family lexer is a base lexer for generating a list of string
  66. * reference tokens to be used with c-style languages.
  67. *
  68. * This base lexer is meant to be used as a stepping stone for an actual
  69. * language lexer/parser.
  70. *
  71. * It reformats the text in the two following ways:
  72. * 1.) Spliced lines (escaped newlines) are merged
  73. * 2.) All comments are converted to a single space
  74. */
  75. struct cf_lexer {
  76. char *file;
  77. struct lexer base_lexer;
  78. char *reformatted, *write_offset;
  79. cf_token_array_t tokens;
  80. bool unexpected_eof; /* unexpected multi-line comment eof */
  81. };
  82. EXPORT void cf_lexer_init(struct cf_lexer *lex);
  83. EXPORT void cf_lexer_free(struct cf_lexer *lex);
  84. static inline struct cf_token *cf_lexer_get_tokens(struct cf_lexer *lex)
  85. {
  86. return lex->tokens.array;
  87. }
  88. EXPORT bool cf_lexer_lex(struct cf_lexer *lex, const char *str,
  89. const char *file);
  90. /* ------------------------------------------------------------------------- */
  91. /* c-family preprocessor definition */
  92. struct cf_def {
  93. struct cf_token name;
  94. cf_token_array_t params;
  95. cf_token_array_t tokens;
  96. bool macro;
  97. };
  98. static inline void cf_def_init(struct cf_def *cfd)
  99. {
  100. cf_token_clear(&cfd->name);
  101. da_init(cfd->params);
  102. da_init(cfd->tokens);
  103. cfd->macro = false;
  104. }
  105. static inline void cf_def_addparam(struct cf_def *cfd, struct cf_token *param)
  106. {
  107. da_push_back(cfd->params, param);
  108. }
  109. static inline void cf_def_addtoken(struct cf_def *cfd, struct cf_token *token)
  110. {
  111. da_push_back(cfd->tokens, token);
  112. }
  113. static inline struct cf_token *cf_def_getparam(const struct cf_def *cfd,
  114. size_t idx)
  115. {
  116. return cfd->params.array + idx;
  117. }
  118. static inline void cf_def_free(struct cf_def *cfd)
  119. {
  120. cf_token_clear(&cfd->name);
  121. da_free(cfd->params);
  122. da_free(cfd->tokens);
  123. }
  124. /* ------------------------------------------------------------------------- */
  125. /*
  126. * C-family preprocessor
  127. *
  128. * This preprocessor allows for standard c-style preprocessor directives
  129. * to be applied to source text, such as:
  130. *
  131. * + #include
  132. * + #define/#undef
  133. * + #ifdef/#ifndef/#else/#endif (#if/#elif are not implemented yet; see TODO)
  134. *
  135. * Still left to implement (TODO):
  136. * + #if/#elif
  137. * + "defined" preprocessor keyword
  138. * + system includes
  139. * + variadic macros
  140. * + custom callbacks (for things like pragma)
  141. * + option to exclude features such as #import, variadic macros, and other
  142. * features for certain language implementations
  143. * + macro parameter string operator #
  144. * + macro parameter token concatenation operator ##
  145. * + restricted macros
  146. */
  147. struct cf_preprocessor {
  148. struct cf_lexer *lex;
  149. struct error_data *ed;
  150. DARRAY(struct cf_def) defines;
  151. DARRAY(char *) sys_include_dirs;
  152. DARRAY(struct cf_lexer) dependencies;
  153. cf_token_array_t tokens;
  154. bool ignore_state;
  155. };
  156. EXPORT void cf_preprocessor_init(struct cf_preprocessor *pp);
  157. EXPORT void cf_preprocessor_free(struct cf_preprocessor *pp);
  158. EXPORT bool cf_preprocess(struct cf_preprocessor *pp, struct cf_lexer *lex,
  159. struct error_data *ed);
  160. static inline void
  161. cf_preprocessor_add_sys_include_dir(struct cf_preprocessor *pp,
  162. const char *include_dir)
  163. {
  164. char *str = bstrdup(include_dir);
  165. if (include_dir)
  166. da_push_back(pp->sys_include_dirs, &str);
  167. }
  168. EXPORT void cf_preprocessor_add_def(struct cf_preprocessor *pp,
  169. struct cf_def *def);
  170. EXPORT void cf_preprocessor_remove_def(struct cf_preprocessor *pp,
  171. const char *def_name);
  172. static inline struct cf_token *
  173. cf_preprocessor_get_tokens(struct cf_preprocessor *pp)
  174. {
  175. return pp->tokens.array;
  176. }
  177. #ifdef __cplusplus
  178. }
  179. #endif