cf-lexer.h 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. /*
  2. * Copyright (c) 2023 Lain Bailey <[email protected]>
  3. *
  4. * Permission to use, copy, modify, and distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. #pragma once
  17. #include "lexer.h"
  18. #ifdef __cplusplus
  19. extern "C" {
  20. #endif
  /*
   * Declared here, defined elsewhere.
   * NOTE(review): judging by the name and signature this presumably turns a
   * quoted literal of `count` characters into a newly allocated C string that
   * the caller must release -- confirm against the implementation.
   */
  21. EXPORT char *cf_literal_to_str(const char *literal, size_t count);
  22. /* ------------------------------------------------------------------------- */
  23. /*
  24. * A C-family lexer token is defined as:
  25. * 1.) A generic 'name' token. (abc123_def456)
  26. * 2.) A numeric sequence (usually starting with a number)
  27. * 3.) A sequence of generic whitespace defined as spaces and tabs
  28. * 4.) A newline
  29. * 5.) A string or character sequence (surrounded by single or double quotes)
  30. * 6.) A single character of a type not specified above
  31. */
  /* Classification of a single c-family token (see the list above). */
  32. enum cf_token_type {
  /* Value 0: what a token holds after cf_token_clear() zeroes it. */
  33. CFTOKEN_NONE,
  /* Generic 'name' token, e.g. abc123_def456. */
  34. CFTOKEN_NAME,
  /* Numeric sequence. */
  35. CFTOKEN_NUM,
  /* Run of spaces and/or tabs. */
  36. CFTOKEN_SPACETAB,
  /* A newline. */
  37. CFTOKEN_NEWLINE,
  /* String or character sequence in single or double quotes. */
  38. CFTOKEN_STRING,
  /* Any single character not covered by the types above. */
  39. CFTOKEN_OTHER
  40. };
  /*
   * One token produced by the c-family lexer, plus the dynamic-array type
   * used throughout this header to hold lists of tokens.
   */
  41. struct cf_token {
  /* NOTE(review): presumably the lexer that produced this token -- confirm. */
  42. const struct cf_lexer *lex;
  /*
   * Two string references for the same token; cf_token_add() extends both.
   * NOTE(review): `str` presumably refers to the reformatted (spliced-line
   * merged) text and `unmerged_str` to the original text -- confirm.
   */
  43. struct strref str;
  44. struct strref unmerged_str;
  /* Token classification; zero is CFTOKEN_NONE. */
  45. enum cf_token_type type;
  46. };
  /* Dynamic array of tokens (DARRAY is declared outside this header). */
  47. typedef DARRAY(struct cf_token) cf_token_array_t;
  48. static inline void cf_token_clear(struct cf_token *t)
  49. {
  50. memset(t, 0, sizeof(struct cf_token));
  51. }
  52. static inline void cf_token_copy(struct cf_token *dst, const struct cf_token *src)
  53. {
  54. memcpy(dst, src, sizeof(struct cf_token));
  55. }
  56. static inline void cf_token_add(struct cf_token *dst, const struct cf_token *add)
  57. {
  58. strref_add(&dst->str, &add->str);
  59. strref_add(&dst->unmerged_str, &add->unmerged_str);
  60. }
  61. /* ------------------------------------------------------------------------- */
  62. /*
  63. * The c-family lexer is a base lexer for generating a list of string
  64. * reference tokens to be used with c-style languages.
  65. *
  66. * This base lexer is meant to be used as a stepping stone for an actual
  67. * language lexer/parser.
  68. *
  69. * It reformats the text in the two following ways:
  70. * 1.) Spliced lines (escaped newlines) are merged
  71. * 2.) All comments are converted to a single space
  72. */
  /* State of one c-family lexer instance (see the description above). */
  73. struct cf_lexer {
  /* NOTE(review): presumably the file name given to cf_lexer_lex() -- confirm. */
  74. char *file;
  /* Underlying generic lexer declared in lexer.h. */
  75. struct lexer base_lexer;
  /*
   * NOTE(review): presumably the buffer holding the reformatted text and the
   * current write position inside it -- confirm in the implementation.
   */
  76. char *reformatted, *write_offset;
  /* Token list; its storage is exposed by cf_lexer_get_tokens(). */
  77. cf_token_array_t tokens;
  78. bool unexpected_eof; /* unexpected multi-line comment eof */
  79. };
  /*
   * Lifecycle of a cf_lexer; both functions are defined elsewhere.
   * NOTE(review): presumably cf_lexer_init() must be called before first use
   * and cf_lexer_free() releases everything the lexer owns -- confirm.
   */
  80. EXPORT void cf_lexer_init(struct cf_lexer *lex);
  81. EXPORT void cf_lexer_free(struct cf_lexer *lex);
  82. static inline struct cf_token *cf_lexer_get_tokens(struct cf_lexer *lex)
  83. {
  84. return lex->tokens.array;
  85. }
  /*
   * Runs the lexer over `str`; defined elsewhere.
   * NOTE(review): presumably fills lex->tokens, records `file` for
   * diagnostics and returns false on failure (e.g. unexpected_eof) -- confirm.
   */
  86. EXPORT bool cf_lexer_lex(struct cf_lexer *lex, const char *str, const char *file);
  87. /* ------------------------------------------------------------------------- */
  88. /* c-family preprocessor definition */
  /* One preprocessor definition: its name, parameter list and body tokens. */
  89. struct cf_def {
  /* Token naming the definition. */
  90. struct cf_token name;
  /* Parameter tokens; appended by cf_def_addparam(). */
  91. cf_token_array_t params;
  /* Body tokens; appended by cf_def_addtoken(). */
  92. cf_token_array_t tokens;
  /*
   * Set to false by cf_def_init().
   * NOTE(review): presumably true for function-like macros -- confirm.
   */
  93. bool macro;
  94. };
  95. static inline void cf_def_init(struct cf_def *cfd)
  96. {
  97. cf_token_clear(&cfd->name);
  98. da_init(cfd->params);
  99. da_init(cfd->tokens);
  100. cfd->macro = false;
  101. }
  /*
   * Appends `param` to the definition's parameter list via da_push_back().
   * NOTE(review): da_push_back presumably copies *param into the array, so
   * the caller keeps its own token -- confirm in the darray header.
   */
  102. static inline void cf_def_addparam(struct cf_def *cfd, struct cf_token *param)
  103. {
  104. da_push_back(cfd->params, param);
  105. }
  /*
   * Appends `token` to the definition's body token list via da_push_back().
   * NOTE(review): da_push_back presumably copies *token into the array, so
   * the caller keeps its own token -- confirm in the darray header.
   */
  106. static inline void cf_def_addtoken(struct cf_def *cfd, struct cf_token *token)
  107. {
  108. da_push_back(cfd->tokens, token);
  109. }
  110. static inline struct cf_token *cf_def_getparam(const struct cf_def *cfd, size_t idx)
  111. {
  112. return cfd->params.array + idx;
  113. }
  114. static inline void cf_def_free(struct cf_def *cfd)
  115. {
  116. cf_token_clear(&cfd->name);
  117. da_free(cfd->params);
  118. da_free(cfd->tokens);
  119. }
  120. /* ------------------------------------------------------------------------- */
  121. /*
  122. * C-family preprocessor
  123. *
  124. * This preprocessor allows for standard c-style preprocessor directives
  125. * to be applied to source text, such as:
  126. *
  127. * + #include
  128. * + #define/#undef
  129. * + #ifdef/#ifndef/#else/#endif (#if/#elif are not implemented yet, see below)
  130. *
  131. * Still left to implement (TODO):
  132. * + #if/#elif
  133. * + "defined" preprocessor keyword
  134. * + system includes
  135. * + variadic macros
  136. * + custom callbacks (for things like pragma)
  137. * + option to exclude features such as #import, variadic macros, and other
  138. * features for certain language implementations
  139. * + macro parameter string operator #
  140. * + macro parameter token concatenation operator ##
  141. * + restricted macros
  142. */
  /* State of one c-family preprocessor run. */
  143. struct cf_preprocessor {
  /* NOTE(review): presumably the lexer passed to cf_preprocess() -- confirm. */
  144. struct cf_lexer *lex;
  /* NOTE(review): presumably the error sink passed to cf_preprocess() -- confirm. */
  145. struct error_data *ed;
  /* Known preprocessor definitions. */
  146. DARRAY(struct cf_def) defines;
  /* Duplicated paths added by cf_preprocessor_add_sys_include_dir(). */
  147. DARRAY(char *) sys_include_dirs;
  /* NOTE(review): presumably lexers created for included files -- confirm. */
  148. DARRAY(struct cf_lexer) dependencies;
  /* Token list; its storage is exposed by cf_preprocessor_get_tokens(). */
  149. cf_token_array_t tokens;
  /* NOTE(review): meaning not visible in this header -- confirm. */
  150. bool ignore_state;
  151. };
  /*
   * Preprocessor lifecycle and entry point; all three are defined elsewhere.
   * NOTE(review): presumably cf_preprocessor_init() must precede any other
   * call, cf_preprocess() fills pp->tokens from `lex` and reports problems
   * through `ed`, returning false on failure, and cf_preprocessor_free()
   * releases everything the preprocessor owns -- confirm.
   */
  152. EXPORT void cf_preprocessor_init(struct cf_preprocessor *pp);
  153. EXPORT void cf_preprocessor_free(struct cf_preprocessor *pp);
  154. EXPORT bool cf_preprocess(struct cf_preprocessor *pp, struct cf_lexer *lex, struct error_data *ed);
  155. static inline void cf_preprocessor_add_sys_include_dir(struct cf_preprocessor *pp, const char *include_dir)
  156. {
  157. char *str = bstrdup(include_dir);
  158. if (include_dir)
  159. da_push_back(pp->sys_include_dirs, &str);
  160. }
  /*
   * Manual management of the preprocessor's definition list; both functions
   * are defined elsewhere.
   * NOTE(review): presumably add_def stores `def` in pp->defines (possibly
   * replacing one with the same name) and remove_def drops the definition
   * whose name equals `def_name` -- confirm ownership of `def`.
   */
  161. EXPORT void cf_preprocessor_add_def(struct cf_preprocessor *pp, struct cf_def *def);
  162. EXPORT void cf_preprocessor_remove_def(struct cf_preprocessor *pp, const char *def_name);
  163. static inline struct cf_token *cf_preprocessor_get_tokens(struct cf_preprocessor *pp)
  164. {
  165. return pp->tokens.array;
  166. }
  167. #ifdef __cplusplus
  168. }
  169. #endif