-
-
Notifications
You must be signed in to change notification settings - Fork 4.5k
/
Copy pathmcnaughton_yamada_thompson.c
721 lines (626 loc) · 21.2 KB
/
mcnaughton_yamada_thompson.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
/**
* @file
* @brief [McNaughton–Yamada–Thompson algorithm](https://en.wikipedia.org/wiki/Thompson%27s_construction)
* @details
* From Wikipedia:
* In computer science, Thompson's construction algorithm,
* also called the McNaughton–Yamada–Thompson algorithm,
* is a method of transforming a regular expression into
* an equivalent nondeterministic finite automaton (NFA).
* This implementation implements the all three operations
* (implicit concatenation, '|' for union, '*' for Kleene star)
* required by the formal definition of regular expressions.
* @author [Sharon Cassidy](https://github.com/CascadingCascade)
*/
#include <assert.h> /// for assert()
#include <stdio.h> /// for IO operations
#include <string.h> /// for string operations
#include <stdlib.h> /// for memory management
/* Begin declarations, I opted to place various helper / utility functions
* close to their usages and didn't split their declaration / definition */
/**
* @brief Definition for a binary abstract syntax tree (AST) node
*/
struct ASTNode {
char content; ///< the content of this node
struct ASTNode* left; ///< left child
struct ASTNode* right; ///< right child
};
struct ASTNode* createNode(char content);
void destroyNode(struct ASTNode* node);
char* preProcessing(const char* input);
struct ASTNode* buildAST(const char* input);
/**
* @brief Definition for a NFA state transition rule
*/
struct transRule {
struct NFAState* target; ///< pointer to the state to transit to
char cond; ///< the input required to activate this transition
};
struct transRule* createRule(struct NFAState* state, char c);
void destroyRule(struct transRule* rule);
/**
* @brief Definition for a NFA state. Each NFAState object is initialized
* to have a capacity of three rules, since there will only be at most two
* outgoing rules and one empty character circular rule in this algorithm
*/
struct NFAState {
int ruleCount; ///< number of transition rules this state have
struct transRule** rules; ///< the transition rules
};
struct NFAState* createState(void);
void destroyState(struct NFAState* state);
/**
* @brief Definition for the NFA itself.
* statePool[0] is defined to be its starting state,
* and statePool[1] is defined to be its accepting state.
* for simplicity's sake all NFAs are initialized to have
* a small fixed capacity, although due to the recursive nature
* of this algorithm this capacity is believed to be sufficient
*/
struct NFA {
int stateCount; ///< the total number of states this NFA have
struct NFAState** statePool; ///< the pool of all available states
int ruleCount; ///< the total number of transition rules in this NFA
struct transRule** rulePool; ///< the pool of all transition rules
int CSCount; ///< the number of currently active states
struct NFAState** currentStates; ///< the pool of all active states
int subCount; ///< the number of sub NFAs
struct NFA** subs; ///< the pool of all sub NFAs
int wrapperFlag; ///< whether this NFA is a concatenation wrapper
};
struct NFA* createNFA(void);
void destroyNFA(struct NFA* nfa);
void addState(struct NFA* nfa, struct NFAState* state);
void addRule(struct NFA* nfa, struct transRule* rule, int loc);
void postProcessing(struct NFA* nfa);
void transit(struct NFA* nfa, char input);
int isAccepting(const struct NFA* nfa);
/* End definitions, begin abstract syntax tree construction */
/**
* @brief helper function to determine whether a character should be
* considered a character literal
* @param ch the character to be tested
* @returns `1` if it is a character literal
* @returns `0` otherwise
*/
int isLiteral(const char ch) {
return !(ch == '(' || ch == ')' || ch == '*' || ch == '\n' || ch == '|');
}
/**
* @brief performs preprocessing on a regex string,
* making all implicit concatenations explicit
* @param input target regex string
* @returns pointer to the processing result
*/
char* preProcessing(const char* input) {
const size_t len = strlen(input);
if(len == 0) {
char* str = malloc(1);
str[0] = '\0';
return str;
}
char* str = malloc(len * 2);
size_t op = 0;
for (size_t i = 0; i < len - 1; ++i) {
char c = input[i];
str[op++] = c;
// one character lookahead
char c1 = input[i + 1];
if( (isLiteral(c) && isLiteral(c1)) ||
(isLiteral(c) && c1 == '(') ||
(c == ')' && c1 == '(') ||
(c == ')' && isLiteral(c1)) ||
(c == '*' && isLiteral(c1)) ||
(c == '*' && c1 == '(')
) {
// '\n' is used to represent concatenation
// in this implementation
str[op++] = '\n';
}
}
str[op++] = input[len - 1];
str[op] = '\0';
return str;
}
/**
* @brief utility function to locate the first occurrence
* of a character in a string while respecting parentheses
* @param str target string
* @param key the character to be located
* @returns the index of its first occurrence, `0` if it could not be found
*/
size_t indexOf(const char* str, char key) {
int depth = 0;
for (size_t i = 0; i < strlen(str); ++i) {
const char c = str[i];
if(depth == 0 && c == key) {
return i;
}
if(c == '(') depth++;
if(c == ')') depth--;
}
// Due to the way this function is intended to be used,
// it's safe to assume the character will not appear as
// the string's first character
// thus `0` is used as the `not found` value
return 0;
}
/**
* @brief utility function to create a subString
* @param str target string
* @param begin starting index, inclusive
* @param end ending index, inclusive
* @returns pointer to the newly created subString
*/
char* subString(const char* str, size_t begin, size_t end) {
char* res = malloc(end - begin + 2);
strncpy(res, str + begin, end - begin + 1);
res[end - begin + 1] = '\0';
return res;
}
/**
* @brief recursively constructs a AST from a preprocessed regex string
* @param input regex
* @returns pointer to the resulting tree
*/
struct ASTNode* buildAST(const char* input) {
struct ASTNode* node = createNode('\0');
node->left = NULL;
node->right = NULL;
const size_t len = strlen(input);
size_t index;
// Empty input
if(len == 0) return node;
// Character literals
if(len == 1) {
node->content = input[0];
return node;
}
// Discard parentheses
if(input[0] == '(' && input[len - 1] == ')') {
char* temp = subString(input, 1, len - 2);
destroyNode(node);
node = buildAST(temp);
free(temp);
return node;
}
// Union
index = indexOf(input, '|');
if(index) {
node->content = '|';
char* temp1 = subString(input, 0, index - 1);
char* temp2 = subString(input, index + 1, len - 1);
node->left = buildAST(temp1);
node->right = buildAST(temp2);
free(temp2);
free(temp1);
return node;
}
// Concatenation
index = indexOf(input, '\n');
if(index) {
node->content = '\n';
char* temp1 = subString(input, 0, index - 1);
char* temp2 = subString(input, index + 1, len - 1);
node->left = buildAST(temp1);
node->right = buildAST(temp2);
free(temp2);
free(temp1);
return node;
}
// Kleene star
// Testing with indexOf() is unnecessary here,
// Since all other possibilities have been exhausted
node->content = '*';
char* temp = subString(input, 0, len - 2);
node->left = buildAST(temp);
node->right = NULL;
free(temp);
return node;
}
/* End AST construction, begins the actual algorithm itself */
/**
* @brief helper function to recursively redirect transition rule targets
* @param nfa target NFA
* @param src the state to redirect away from
* @param dest the state to redirect to
* @returns void
*/
void redirect(struct NFA* nfa, struct NFAState* src, struct NFAState* dest) {
for (int i = 0; i < nfa->subCount; ++i) {
redirect(nfa->subs[i], src, dest);
}
for (int i = 0; i < nfa->ruleCount; ++i) {
struct transRule* rule = nfa->rulePool[i];
if (rule->target == src) {
rule->target = dest;
}
}
}
struct NFA* compileFromAST(struct ASTNode* root) {
struct NFA* nfa = createNFA();
// Empty input
if (root->content == '\0') {
addRule(nfa, createRule(nfa->statePool[1], '\0'), 0);
return nfa;
}
// Character literals
if (isLiteral(root->content)) {
addRule(nfa, createRule(nfa->statePool[1], root->content), 0);
return nfa;
}
switch (root->content) {
case '\n': {
struct NFA* ln = compileFromAST(root->left);
struct NFA* rn = compileFromAST(root->right);
// Redirects all rules targeting ln's accepting state to
// target rn's starting state
redirect(ln, ln->statePool[1], rn->statePool[0]);
// Manually creates and initializes a special
// "wrapper" NFA
destroyNFA(nfa);
struct NFA* wrapper = malloc(sizeof(struct NFA));
wrapper->stateCount = 2;
wrapper->statePool = malloc(sizeof(struct NFAState*) * 2);
wrapper->subCount = 0;
wrapper->subs = malloc(sizeof(struct NFA*) * 2);
wrapper->ruleCount = 0;
wrapper->rulePool = malloc(sizeof(struct transRule*) * 3);
wrapper->CSCount = 0;
wrapper->currentStates = malloc(sizeof(struct NFAState*) * 2);
wrapper->wrapperFlag = 1;
wrapper->subs[wrapper->subCount++] = ln;
wrapper->subs[wrapper->subCount++] = rn;
// Maps the wrapper NFA's starting and ending states
// to its sub NFAs
wrapper->statePool[0] = ln->statePool[0];
wrapper->statePool[1] = rn->statePool[1];
return wrapper;
}
case '|': {
struct NFA* ln = compileFromAST(root->left);
struct NFA* rn = compileFromAST(root->right);
nfa->subs[nfa->subCount++] = ln;
nfa->subs[nfa->subCount++] = rn;
// Adds empty character transition rules
addRule(nfa, createRule(ln->statePool[0], '\0'), 0);
addRule(ln, createRule(nfa->statePool[1], '\0'), 1);
addRule(nfa, createRule(rn->statePool[0], '\0'), 0);
addRule(rn, createRule(nfa->statePool[1], '\0'), 1);
return nfa;
}
case '*': {
struct NFA* ln = compileFromAST(root->left);
nfa->subs[nfa->subCount++] = ln;
addRule(ln, createRule(ln->statePool[0], '\0'), 1);
addRule(nfa, createRule(ln->statePool[0], '\0'), 0);
addRule(ln, createRule(nfa->statePool[1], '\0'), 1);
addRule(nfa, createRule(nfa->statePool[1], '\0'), 0);
return nfa;
}
}
// Fallback, shouldn't happen in normal operation
destroyNFA(nfa);
return NULL;
}
/* Ends the algorithm, begins NFA utility functions*/
/**
* @brief adds a state to a NFA
* @param nfa target NFA
* @param state the NFA state to be added
* @returns void
*/
void addState(struct NFA* nfa, struct NFAState* state) {
nfa->statePool[nfa->stateCount++] = state;
}
/**
* @brief adds a transition rule to a NFA
* @param nfa target NFA
* @param rule the rule to be added
* @param loc which state this rule should be added to
* @returns void
*/
void addRule(struct NFA* nfa, struct transRule* rule, int loc) {
nfa->rulePool[nfa->ruleCount++] = rule;
struct NFAState* state = nfa->statePool[loc];
state->rules[state->ruleCount++] = rule;
}
/**
* @brief performs postprocessing on a compiled NFA,
* add circular empty character transition rules where
* it's needed for the NFA to function correctly
* @param nfa target NFA
* @returns void
*/
void postProcessing(struct NFA* nfa) {
// Since the sub NFA's states and rules are managed
// through their own pools, recursion is necessary
for (int i = 0; i < nfa->subCount; ++i) {
postProcessing(nfa->subs[i]);
}
// If a state does not have any empty character accepting rule,
// we add a rule that circles back to itself
// So this state will be preserved when
// empty characters are inputted
for (int i = 0; i < nfa->stateCount; ++i) {
struct NFAState* pState = nfa->statePool[i];
int f = 0;
for (int j = 0; j < pState->ruleCount; ++j) {
if(pState->rules[j]->cond == '\0') {
f = 1;
break;
}
}
if (!f) {
addRule(nfa, createRule(pState, '\0'), i);
}
}
}
/**
* @brief helper function to determine an element's presence in an array
* @param states target array
* @param len length of the target array
* @param state the element to search for
* @returns `1` if the element is present, `0` otherwise
*/
int contains(struct NFAState** states, int len, struct NFAState* state) {
int f = 0;
for (int i = 0; i < len; ++i) {
if(states[i] == state) {
f = 1;
break;
}
}
return f;
}
/**
* @brief helper function to manage empty character transitions
* @param target target NFA
* @param states pointer to results storage location
* @param sc pointer to results count storage location
* @returns void
*/
void findEmpty(struct NFAState* target, struct NFAState** states, int *sc) {
for (int i = 0; i < target->ruleCount; ++i) {
const struct transRule *pRule = target->rules[i];
if (pRule->cond == '\0' && !contains(states, *sc, pRule->target)) {
states[(*sc)++] = pRule->target;
// the use of `states` and `sc` is necessary
// to sync data across recursion levels
findEmpty(pRule->target, states, sc);
}
}
}
/**
* @brief moves a NFA forward
* @param nfa target NFA
* @param input the character to be fed into the NFA
* @returns void
*/
void transit(struct NFA* nfa, char input) {
struct NFAState** newStates = malloc(sizeof(struct NFAState*) * 10);
int NSCount = 0;
if (input == '\0') {
// In case of empty character input, it's possible for
// a state to transit to another state that's more than
// one rule away, we need to take that into account
for (int i = nfa->CSCount - 1; i > -1; --i) {
struct NFAState *pState = nfa->currentStates[i];
nfa->CSCount--;
struct NFAState** states = malloc(sizeof(struct NFAState*) * 10);
int sc = 0;
findEmpty(pState, states, &sc);
for (int j = 0; j < sc; ++j) {
if(!contains(newStates,NSCount, states[j])) {
newStates[NSCount++] = states[j];
}
}
free(states);
}
} else {
// Iterates through all current states
for (int i = nfa->CSCount - 1; i > -1; --i) {
struct NFAState *pState = nfa->currentStates[i];
// Gradually empties the current states pool, so
// it can be refilled
nfa->CSCount--;
// Iterates through rules of this state
for (int j = 0; j < pState->ruleCount; ++j) {
const struct transRule *pRule = pState->rules[j];
if(pRule->cond == input) {
if(!contains(newStates, NSCount, pRule->target)) {
newStates[NSCount++] = pRule->target;
}
}
}
}
}
nfa->CSCount = NSCount;
for (int i = 0; i < NSCount; ++i) {
nfa->currentStates[i] = newStates[i];
}
free(newStates);
}
/**
* @brief determines whether the NFA is currently in its accepting state
* @param nfa target NFA
* @returns `1` if the NFA is in its accepting state
* @returns `0` otherwise
*/
int isAccepting(const struct NFA* nfa) {
for (int i = 0; i < nfa->CSCount; ++i) {
if(nfa->currentStates[i] == nfa->statePool[1]) {
return 1;
}
}
return 0;
}
/* Ends NFA utilities, begins testing function*/
/**
* @brief Testing helper function
* @param regex the regular expression to be used
* @param string the string to match against
* @param expected expected results
* @returns void
*/
void testHelper(const char* regex, const char* string, const int expected) {
char* temp = preProcessing(regex);
struct ASTNode* node = buildAST(temp);
struct NFA* nfa = compileFromAST(node);
postProcessing(nfa);
// reallocates the outermost NFA's current states pool
// because it will actually be used to store all the states
nfa->currentStates = realloc(nfa->currentStates, sizeof(struct NFAState*) * 100);
// Starts the NFA by adding its starting state to the pool
nfa->currentStates[nfa->CSCount++] = nfa->statePool[0];
// feeds empty characters into the NFA before and after
// every normal character
for (size_t i = 0; i < strlen(string); ++i) {
transit(nfa, '\0');
transit(nfa, string[i]);
}
transit(nfa, '\0');
assert(isAccepting(nfa) == expected);
destroyNFA(nfa);
destroyNode(node);
free(temp);
}
/**
* @brief Self-test implementations
* @returns void
*/
static void test(void) {
testHelper("(c|a*b)", "c", 1);
testHelper("(c|a*b)", "aab", 1);
testHelper("(c|a*b)", "ca", 0);
testHelper("(c|a*b)*", "caaab", 1);
testHelper("(c|a*b)*", "caba", 0);
testHelper("", "", 1);
testHelper("", "1", 0);
testHelper("(0|(1(01*(00)*0)*1)*)*","11",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","110",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","1100",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","10000",0);
testHelper("(0|(1(01*(00)*0)*1)*)*","00000",1);
printf("All tests have successfully passed!\n");
}
/**
* @brief Main function
* @returns 0 on exit
*/
int main(void) {
test(); // run self-test implementations
return 0;
}
/* I opted to place these more-or-less boilerplate code and their docs
* at the end of file for better readability */
/**
* @brief creates and initializes a AST node
* @param content data to initializes the node with
* @returns pointer to the newly created node
*/
struct ASTNode* createNode(const char content) {
struct ASTNode* node = malloc(sizeof(struct ASTNode));
node->content = content;
node->left = NULL;
node->right = NULL;
return node;
}
/**
* @brief recursively destroys a AST
* @param node the root node of the tree to be deleted
* @returns void
*/
void destroyNode(struct ASTNode* node) {
if(node->left != NULL) {
destroyNode(node->left);
}
if(node->right != NULL) {
destroyNode(node->right);
}
free(node);
}
/**
* @brief creates and initializes a transition rule
* @param state transition target
* @param c transition condition
* @returns pointer to the newly created rule
*/
struct transRule* createRule(struct NFAState* state, char c) {
struct transRule* rule = malloc(sizeof(struct transRule));
rule->target = state;
rule->cond = c;
return rule;
}
/**
* @brief destroys a transition rule object
* @param rule pointer to the object to be deleted
* @returns void
*/
void destroyRule(struct transRule* rule) {
free(rule);
}
/**
* @brief creates and initializes a NFA state
* @returns pointer to the newly created NFA state
*/
struct NFAState* createState(void) {
struct NFAState* state = malloc(sizeof(struct NFAState));
state->ruleCount = 0;
state->rules = malloc(sizeof(struct transRule*) * 3);
return state;
}
/**
* @brief destroys a NFA state
* @param state pointer to the object to be deleted
* @returns void
*/
void destroyState(struct NFAState* state) {
free(state->rules);
free(state);
}
/**
* @brief creates and initializes a NFA
* @returns pointer to the newly created NFA
*/
struct NFA* createNFA(void) {
struct NFA* nfa = malloc(sizeof(struct NFA));
nfa->stateCount = 0;
nfa->statePool = malloc(sizeof(struct NFAState*) * 5);
nfa->ruleCount = 0;
nfa->rulePool = malloc(sizeof(struct transRule*) * 10);
nfa->CSCount = 0;
nfa->currentStates = malloc(sizeof(struct NFAState*) * 5);
nfa->subCount = 0;
nfa->subs = malloc(sizeof(struct NFA*) * 5);
nfa->wrapperFlag = 0;
addState(nfa, createState());
addState(nfa, createState());
return nfa;
}
/**
* @brief recursively destroys a NFA
* @param nfa pointer to the object to be deleted
* @returns void
*/
void destroyNFA(struct NFA* nfa) {
for (int i = 0; i < nfa->subCount; ++i) {
destroyNFA(nfa->subs[i]);
}
// In case of a wrapper NFA, do not free its states
// because it doesn't really have any states of its own
if (!nfa->wrapperFlag) {
for (int i = 0; i < nfa->stateCount; ++i) {
destroyState(nfa->statePool[i]);
}
}
for (int i = 0; i < nfa->ruleCount; ++i) {
destroyRule(nfa->rulePool[i]);
}
free(nfa->statePool);
free(nfa->currentStates);
free(nfa->rulePool);
free(nfa->subs);
free(nfa);
}