Skip to content

Commit 3fa250a

Browse files
committed
Refactor frontend and error diagnostic
This patch completely separates the preprocessor functionality from the scanner-less parser into independent units, which allows the compiler to expand and parse nested function-like macros, multi-token object-like macros, and more. Furthermore, the error diagnostics are rewritten to better help the user find out where, and which lexeme, causes the compiler to panic.
1 parent c044948 commit 3fa250a

28 files changed

+2502
-1494
lines changed

.github/workflows/main.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,34 @@ jobs:
8888
make check-sanitizer DYNLINK=${{ steps.determine-mode.outputs.DYNLINK }} || exit 1
8989
make check DYNLINK=${{ steps.determine-mode.outputs.DYNLINK }} || exit 1
9090
91+
preprocessor-host:
92+
runs-on: ubuntu-24.04
93+
strategy:
94+
matrix:
95+
compiler: [gcc, clang]
96+
architecture: [arm, riscv]
97+
steps:
98+
- name: Checkout code
99+
uses: actions/checkout@v4
100+
- name: Download dependencies
101+
run: |
102+
sudo apt-get update -q -y
103+
sudo apt-get install -q -y graphviz jq
104+
sudo apt-get install -q -y qemu-user
105+
sudo apt-get install -q -y build-essential
106+
- name: Configurate config
107+
run: |
108+
make distclean config ARCH=${{ matrix.architecture }}
109+
- name: Preprocess stage 1 source code
110+
env:
111+
CC: ${{ matrix.compiler }}
112+
run: |
113+
make out/shecc
114+
./out/shecc -E src/main.c > ./out/out.c
115+
- name: Build stage 1 artifact
116+
run: |
117+
./out/shecc --no-libc -o out/shecc-stage1.elf ./out/out.c
118+
91119
coding-style:
92120
runs-on: ubuntu-24.04
93121
steps:

COMPLIANCE.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,10 @@ This document tracks compliance gaps and non-standard behaviors.
3737
- `#define` for object-like and function-like macros
3838
- `#ifdef`, `#ifndef`, `#if`, `#elif`, `#else`, `#endif`
3939
- `#undef` for macro removal
40+
- `#pragma once`, other `#pragma` options will be ignored
4041
- `defined()` operator
4142
- `__VA_ARGS__` for variadic macros
43+
- `__FILE__`, `__LINE__` built-in macros
4244

4345
## Missing Features
4446

@@ -83,15 +85,12 @@ This document tracks compliance gaps and non-standard behaviors.
8385

8486
| Feature | Status | Description |
8587
|---------|--------|-------------|
86-
| `#include` | Parsed only | No file inclusion |
88+
| `#include` | Parsed only | Local file inclusion is supported, but including system header files is not yet supported |
8789
| Token pasting (`##`) | Missing | Cannot concatenate tokens |
8890
| Stringizing (`#`) | Missing | Cannot convert to string |
89-
| `__FILE__` | Missing | No file name macro |
90-
| `__LINE__` | Missing | No line number macro |
9191
| `__DATE__` | Missing | No compile date |
9292
| `__TIME__` | Missing | No compile time |
9393
| `__STDC__` | Missing | No standard compliance indicator |
94-
| `#pragma` | Ignored | Accepted but no effect |
9594

9695
### Advanced Features
9796

lib/c.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,34 @@ int fputc(int c, FILE *stream)
543543
return c;
544544
}
545545

546+
/* Reposition the file offset associated with 'stream'.
 *
 * @stream: handle forwarded unchanged as the first argument of the seek
 *          system call (presumably the underlying file descriptor; verify
 *          against fopen).
 * @offset: signed byte offset, interpreted relative to 'whence'.
 * @whence: one of SEEK_SET, SEEK_CUR or SEEK_END.
 *
 * Returns 0 on success and -1 when the system call reports an error, so that
 * callers always observe a defined value.
 */
int fseek(FILE *stream, int offset, int whence)
{
#if defined(__arm__)
    /* lseek yields the new offset on success and a negative value on error */
    if (__syscall(__syscall_lseek, stream, offset, whence) < 0)
        return -1;
    return 0;
#elif defined(__riscv)
    /* The five-argument form matches Linux llseek(fd, offset_high,
     * offset_low, result, whence). The kernel stores a 64-bit position
     * through 'result', so hand it a valid two-word buffer rather than NULL.
     */
    int position[2];
    int offset_high = 0;

    /* Sign-extend so that negative offsets seek backwards. */
    if (offset < 0)
        offset_high = -1;

    if (__syscall(__syscall_lseek, stream, offset_high, offset, position,
                  whence) < 0)
        return -1;
    return 0;
#else
#error "Unsupported fseek support for current platform"
#endif
}
557+
558+
/* Report the current file offset of 'stream'.
 *
 * @stream: handle forwarded unchanged as the first argument of the seek
 *          system call (presumably the underlying file descriptor; verify
 *          against fopen).
 *
 * Returns the current offset, or -1 when the system call reports an error.
 */
int ftell(FILE *stream)
{
#if defined(__arm__)
    /* Seeking by zero from the current position returns that position. */
    int position = __syscall(__syscall_lseek, stream, 0, SEEK_CUR);
    if (position < 0)
        return -1;
    return position;
#elif defined(__riscv)
    /* The five-argument form matches Linux llseek(fd, offset_high,
     * offset_low, result, whence). The kernel stores a 64-bit position
     * through 'result', so reserve two words; a single int would be
     * overrun.
     */
    int position[2];

    if (__syscall(__syscall_lseek, stream, 0, 0, position, SEEK_CUR) < 0)
        return -1;

    /* Low word comes first, assuming a little-endian target. */
    return position[0];
#else
#error "Unsupported ftell support for current platform"
#endif
}
570+
571+
/* Non-portable: Assume page size is 4KiB */
572+
#define PAGESIZE 4096
573+
546574
#define CHUNK_SIZE_FREED_MASK 1
547575
#define CHUNK_SIZE_SZ_MASK 0xFFFFFFFE
548576
#define CHUNK_GET_SIZE(size) (size & CHUNK_SIZE_SZ_MASK)

lib/c.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@
1717
#define INT_MAX 0x7fffffff
1818
#define INT_MIN 0x80000000
1919

20+
#define SEEK_SET 0
21+
#define SEEK_CUR 1
22+
#define SEEK_END 2
23+
2024
#if defined(__arm__)
2125
#define __SIZEOF_POINTER__ 4
2226
#define __syscall_exit 1
2327
#define __syscall_read 3
2428
#define __syscall_write 4
2529
#define __syscall_close 6
2630
#define __syscall_open 5
31+
#define __syscall_lseek 19
2732
#define __syscall_mmap2 192
2833
#define __syscall_munmap 91
2934

@@ -35,6 +40,7 @@
3540
#define __syscall_close 57
3641
#define __syscall_open 1024
3742
#define __syscall_openat 56
43+
#define __syscall_lseek 62
3844
#define __syscall_mmap2 222
3945
#define __syscall_munmap 215
4046

@@ -59,6 +65,8 @@ int fclose(FILE *stream);
5965
int fgetc(FILE *stream);
6066
char *fgets(char *str, int n, FILE *stream);
6167
int fputc(int c, FILE *stream);
68+
int fseek(FILE *stream, int offset, int whence);
69+
int ftell(FILE *stream);
6270

6371
/* string-related functions */
6472
int strlen(char *str);

src/arm.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ arm_cond_t arm_get_cond(opcode_t op)
104104
case OP_leq:
105105
return __LE;
106106
default:
107-
error("Unsupported condition IR opcode");
107+
fatal("Unsupported condition IR opcode");
108108
}
109109
return __AL;
110110
}
@@ -113,7 +113,7 @@ int arm_extract_bits(int imm, int i_start, int i_end, int d_start, int d_end)
113113
{
114114
if (((d_end - d_start) != (i_end - i_start)) || (i_start > i_end) ||
115115
(d_start > d_end))
116-
error("Invalid bit copy");
116+
fatal("Invalid bit copy");
117117

118118
int v = imm >> i_start;
119119
v &= ((2 << (i_end - i_start)) - 1);
@@ -143,7 +143,7 @@ int __mov(arm_cond_t cond, int io, int opcode, int s, int rn, int rd, int op2)
143143
}
144144
if (op2 > 255)
145145
/* value spans more than 8 bits */
146-
error("Unable to represent value");
146+
fatal("Unable to represent value");
147147
}
148148
return arm_encode(cond, s + (opcode << 1) + (io << 5), rn, rd,
149149
(shift << 8) + (op2 & 255));
@@ -286,7 +286,7 @@ int arm_halfword_transfer(arm_cond_t cond,
286286
}
287287

288288
if (ofs > 255)
289-
error("Halfword offset too large");
289+
fatal("Halfword offset too large");
290290

291291
/* Halfword encoding: split offset into 4-bit high and low parts */
292292
int imm4H = ((ofs >> 4) & 0xF) << 8;

src/defs.h

Lines changed: 33 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@
1010

1111
/* definitions */
1212

13+
/* Common macro functions
 *
 * Character classification helpers. Every use of the argument is
 * parenthesized so that expressions such as 'ch & 0xff' classify correctly.
 * Each macro may evaluate its argument more than once; do not pass
 * expressions with side effects (e.g. 'p++').
 */
/* Horizontal blank: space or tab. */
#define is_whitespace(c) ((c) == ' ' || (c) == '\t')
/* Line terminator: carriage return or line feed. */
#define is_newline(c) ((c) == '\r' || (c) == '\n')
/* Identifier character: ASCII letter, decimal digit, or underscore. */
#define is_alnum(c)                                              \
    (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= '0' && (c) <= '9') || ((c) == '_'))
/* Decimal digit. */
#define is_digit(c) ((c) >= '0' && (c) <= '9')
/* Hexadecimal digit, either case. */
#define is_hex(c) \
    (is_digit(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F'))
22+
1323
/* Limitations */
1424
#define MAX_TOKEN_LEN 256
1525
#define MAX_ID_LEN 64
@@ -26,15 +36,13 @@
2636
#define MAX_BB_DOM_SUCC 64
2737
#define MAX_BB_RDOM_SUCC 256
2838
#define MAX_GLOBAL_IR 256
29-
#define MAX_SOURCE 1048576
3039
#define MAX_CODE 262144
3140
#define MAX_DATA 262144
3241
#define MAX_SYMTAB 65536
3342
#define MAX_STRTAB 65536
3443
#define MAX_HEADER 1024
3544
#define MAX_PROGRAM_HEADER 1024
3645
#define MAX_SECTION 1024
37-
#define MAX_ALIASES 128
3846
#define MAX_SECTION_HEADER 1024
3947
#define MAX_SHSTR 1024
4048
#define MAX_INTERP 1024
@@ -56,7 +64,7 @@
5664
#define SMALL_ARENA_SIZE 65536 /* 64 KiB - for small allocations */
5765
#define LARGE_ARENA_SIZE 524288 /* 512 KiB - for instruction arena */
5866
#define DEFAULT_FUNCS_SIZE 64
59-
#define DEFAULT_INCLUSIONS_SIZE 16
67+
#define DEFAULT_SRC_FILE_COUNT 8
6068

6169
/* Arena compaction bitmask flags for selective memory reclamation */
6270
#define COMPACT_ARENA_BLOCK 0x01 /* BLOCK_ARENA - variables/blocks */
@@ -131,6 +139,7 @@ typedef struct {
131139
/* lexer tokens */
132140
typedef enum {
133141
T_start, /* FIXME: Unused, intended for lexer state machine init */
142+
T_eof, /* end-of-file (EOF) */
134143
T_numeric,
135144
T_identifier,
136145
T_comma, /* , */
@@ -179,7 +188,6 @@ typedef enum {
179188
T_question, /* ? */
180189
T_colon, /* : */
181190
T_semicolon, /* ; */
182-
T_eof, /* end-of-file (EOF) */
183191
T_ampersand, /* & */
184192
T_return,
185193
T_if,
@@ -211,38 +219,36 @@ typedef enum {
211219
T_cppd_endif,
212220
T_cppd_ifdef,
213221
T_cppd_ifndef,
214-
T_cppd_pragma
215-
} token_t;
222+
T_cppd_pragma,
223+
/* C pre-processor specific, these kinds
224+
* will be removed after pre-processing is done.
225+
*/
226+
T_newline,
227+
T_backslash,
228+
T_whitespace,
229+
T_tab
230+
} token_kind_t;
216231

217232
/* Source location tracking for better error reporting */
218233
typedef struct {
234+
int pos; /* raw source file position */
235+
int len; /* length of token */
219236
int line;
220237
int column;
221238
char *filename;
222239
} source_location_t;
223240

224-
/* Token structure with metadata for enhanced lexing */
225-
typedef struct token_info {
226-
token_t type;
227-
char value[MAX_TOKEN_LEN];
241+
typedef struct token {
242+
token_kind_t kind;
243+
char *literal;
228244
source_location_t location;
229-
struct token_info *next; /* For freelist management */
230-
} token_info_t;
231-
232-
/* Token freelist for memory reuse */
233-
typedef struct {
234-
token_info_t *freelist;
235-
int allocated_count;
236-
} token_pool_t;
245+
struct token *next;
246+
} token_t;
237247

238-
/* Token buffer for improved lookahead */
239-
#define TOKEN_BUFFER_SIZE 8
240-
typedef struct {
241-
token_info_t *tokens[TOKEN_BUFFER_SIZE];
242-
int head;
243-
int tail;
244-
int count;
245-
} token_buffer_t;
248+
typedef struct token_stream {
249+
token_t *head;
250+
token_t *tail;
251+
} token_stream_t;
246252

247253
/* String pool for identifier deduplication */
248254
typedef struct {
@@ -387,7 +393,7 @@ struct var {
387393
int in_loop;
388394
struct var *base;
389395
int subscript;
390-
struct var *subscripts[64];
396+
struct var *subscripts[128];
391397
int subscripts_idx;
392398
rename_t rename;
393399
ref_block_list_t ref_block_list; /* blocks which kill variable */
@@ -412,25 +418,13 @@ struct var {
412418
bool ofs_based_on_stack_top;
413419
};
414420

415-
typedef struct {
416-
char name[MAX_VAR_LEN];
417-
bool is_variadic;
418-
int start_source_idx;
419-
var_t param_defs[MAX_PARAMS];
420-
int num_param_defs;
421-
int params[MAX_PARAMS];
422-
int num_params;
423-
bool disabled;
424-
} macro_t;
425-
426421
typedef struct func func_t;
427422

428423
/* block definition */
429424
struct block {
430425
var_list_t locals;
431426
struct block *parent;
432427
func_t *func;
433-
macro_t *macro;
434428
struct block *next;
435429
};
436430

@@ -494,13 +488,6 @@ typedef struct {
494488
type_t *type;
495489
} lvalue_t;
496490

497-
/* alias for #defines */
498-
typedef struct {
499-
char alias[MAX_VAR_LEN];
500-
char value[MAX_VAR_LEN];
501-
bool disabled;
502-
} alias_t;
503-
504491
/* constants for enums */
505492
typedef struct {
506493
char alias[MAX_VAR_LEN];

src/elf.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ void elf_generate_header(void)
6262
{
6363
/* Check for null pointers to prevent crashes */
6464
if (!elf_code || !elf_data || !elf_symtab || !elf_strtab || !elf_header) {
65-
error("ELF buffers not initialized");
65+
fatal("ELF buffers not initialized");
6666
return;
6767
}
6868

@@ -193,7 +193,7 @@ void elf_generate_program_headers(void)
193193
!dynamic_sections.elf_plt || !dynamic_sections.elf_got ||
194194
!dynamic_sections.elf_dynstr || !dynamic_sections.elf_dynsym ||
195195
!dynamic_sections.elf_dynamic))) {
196-
error("ELF section buffers not initialized");
196+
fatal("ELF section buffers not initialized");
197197
return;
198198
}
199199

@@ -316,7 +316,7 @@ void elf_generate_section_headers(void)
316316
!dynamic_sections.elf_plt || !dynamic_sections.elf_got ||
317317
!dynamic_sections.elf_dynstr || !dynamic_sections.elf_dynsym ||
318318
!dynamic_sections.elf_dynamic))) {
319-
error("ELF section buffers not initialized");
319+
fatal("ELF section buffers not initialized");
320320
return;
321321
}
322322

@@ -587,7 +587,7 @@ void elf_align(strbuf_t *elf_array)
587587
{
588588
/* Check for null pointers to prevent crashes */
589589
if (!elf_array) {
590-
error("ELF buffers not initialized for alignment");
590+
fatal("ELF buffers not initialized for alignment");
591591
return;
592592
}
593593

@@ -603,7 +603,7 @@ void elf_generate_sections(void)
603603
!dynamic_sections.elf_plt || !dynamic_sections.elf_got ||
604604
!dynamic_sections.elf_dynstr || !dynamic_sections.elf_dynsym ||
605605
!dynamic_sections.elf_dynamic))) {
606-
error("ELF section buffers not initialized");
606+
fatal("ELF section buffers not initialized");
607607
return;
608608
}
609609

@@ -820,7 +820,7 @@ void elf_add_symbol(const char *symbol, int pc)
820820
{
821821
/* Check for null pointers to prevent crashes */
822822
if (!symbol || !elf_symtab || !elf_strtab) {
823-
error("Invalid parameters for elf_add_symbol");
823+
fatal("Invalid parameters for elf_add_symbol");
824824
return;
825825
}
826826

@@ -928,7 +928,7 @@ void elf_generate(const char *outfile)
928928

929929
FILE *fp = fopen(outfile, "wb");
930930
if (!fp) {
931-
error("Unable to open output file for writing");
931+
fatal("Unable to open output file for writing");
932932
return;
933933
}
934934

0 commit comments

Comments
 (0)