diff options
Diffstat (limited to 'src/regex/regex_internal.c')
-rw-r--r-- | src/regex/regex_internal.c | 3718 |
1 files changed, 0 insertions, 3718 deletions
diff --git a/src/regex/regex_internal.c b/src/regex/regex_internal.c deleted file mode 100644 index aa40851a9..000000000 --- a/src/regex/regex_internal.c +++ /dev/null | |||
@@ -1,3718 +0,0 @@ | |||
1 | /* | ||
2 | This file is part of GNUnet | ||
3 | Copyright (C) 2012 GNUnet e.V. | ||
4 | |||
5 | GNUnet is free software: you can redistribute it and/or modify it | ||
6 | under the terms of the GNU Affero General Public License as published | ||
7 | by the Free Software Foundation, either version 3 of the License, | ||
8 | or (at your option) any later version. | ||
9 | |||
10 | GNUnet is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | Affero General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU Affero General Public License | ||
16 | along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
17 | |||
18 | SPDX-License-Identifier: AGPL3.0-or-later | ||
19 | */ | ||
20 | /** | ||
21 | * @file src/regex/regex_internal.c | ||
22 | * @brief library to create Deterministic Finite Automatons (DFAs) from regular | ||
23 | * expressions (regexes). | ||
24 | * @author Maximilian Szengel | ||
25 | */ | ||
26 | #include "platform.h" | ||
27 | #include "gnunet_util_lib.h" | ||
28 | #include "gnunet_regex_service.h" | ||
29 | #include "regex_internal_lib.h" | ||
30 | #include "regex_internal.h" | ||
31 | |||
32 | |||
33 | /** | ||
34 | * Set this to #GNUNET_YES to enable state naming. Used to debug NFA->DFA | ||
35 | * creation. Disabled by default for better performance. | ||
36 | */ | ||
37 | #define REGEX_DEBUG_DFA GNUNET_NO | ||
38 | |||
39 | /** | ||
40 | * Set of states using MDLL API. | ||
41 | */ | ||
42 | struct REGEX_INTERNAL_StateSet_MDLL | ||
43 | { | ||
44 | /** | ||
45 | * MDLL of states. | ||
46 | */ | ||
47 | struct REGEX_INTERNAL_State *head; | ||
48 | |||
49 | /** | ||
50 | * MDLL of states. | ||
51 | */ | ||
52 | struct REGEX_INTERNAL_State *tail; | ||
53 | |||
54 | /** | ||
55 | * Length of the MDLL. | ||
56 | */ | ||
57 | unsigned int len; | ||
58 | }; | ||
59 | |||
60 | |||
61 | /** | ||
62 | * Append state to the given StateSet. | ||
63 | * | ||
64 | * @param set set to be modified | ||
65 | * @param state state to be appended | ||
66 | */ | ||
67 | static void | ||
68 | state_set_append (struct REGEX_INTERNAL_StateSet *set, | ||
69 | struct REGEX_INTERNAL_State *state) | ||
70 | { | ||
71 | if (set->off == set->size) | ||
72 | GNUNET_array_grow (set->states, set->size, set->size * 2 + 4); | ||
73 | set->states[set->off++] = state; | ||
74 | } | ||
75 | |||
76 | |||
77 | /** | ||
78 | * Compare two strings for equality. If either is NULL they are not equal. | ||
79 | * | ||
80 | * @param str1 first string for comparison. | ||
81 | * @param str2 second string for comparison. | ||
82 | * | ||
83 | * @return 0 if the strings are the same or both NULL, 1 or -1 if not. | ||
84 | */ | ||
85 | static int | ||
86 | nullstrcmp (const char *str1, const char *str2) | ||
87 | { | ||
88 | if ((NULL == str1) != (NULL == str2)) | ||
89 | return -1; | ||
90 | if ((NULL == str1) && (NULL == str2)) | ||
91 | return 0; | ||
92 | |||
93 | return strcmp (str1, str2); | ||
94 | } | ||
95 | |||
96 | |||
97 | /** | ||
98 | * Adds a transition from one state to another on @a label. Does not add | ||
99 | * duplicate states. | ||
100 | * | ||
101 | * @param ctx context | ||
102 | * @param from_state starting state for the transition | ||
103 | * @param label transition label | ||
104 | * @param to_state state to where the transition should point to | ||
105 | */ | ||
106 | static void | ||
107 | state_add_transition (struct REGEX_INTERNAL_Context *ctx, | ||
108 | struct REGEX_INTERNAL_State *from_state, | ||
109 | const char *label, | ||
110 | struct REGEX_INTERNAL_State *to_state) | ||
111 | { | ||
112 | struct REGEX_INTERNAL_Transition *t; | ||
113 | struct REGEX_INTERNAL_Transition *oth; | ||
114 | |||
115 | if (NULL == from_state) | ||
116 | { | ||
117 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not create Transition.\n"); | ||
118 | return; | ||
119 | } | ||
120 | |||
121 | /* Do not add duplicate state transitions */ | ||
122 | for (t = from_state->transitions_head; NULL != t; t = t->next) | ||
123 | { | ||
124 | if ((t->to_state == to_state) && (0 == nullstrcmp (t->label, label)) && | ||
125 | (t->from_state == from_state) ) | ||
126 | return; | ||
127 | } | ||
128 | |||
129 | /* sort transitions by label */ | ||
130 | for (oth = from_state->transitions_head; NULL != oth; oth = oth->next) | ||
131 | { | ||
132 | if (0 < nullstrcmp (oth->label, label)) | ||
133 | break; | ||
134 | } | ||
135 | |||
136 | t = GNUNET_new (struct REGEX_INTERNAL_Transition); | ||
137 | if (NULL != ctx) | ||
138 | t->id = ctx->transition_id++; | ||
139 | if (NULL != label) | ||
140 | t->label = GNUNET_strdup (label); | ||
141 | else | ||
142 | t->label = NULL; | ||
143 | t->to_state = to_state; | ||
144 | t->from_state = from_state; | ||
145 | |||
146 | /* Add outgoing transition to 'from_state' */ | ||
147 | from_state->transition_count++; | ||
148 | GNUNET_CONTAINER_DLL_insert_before (from_state->transitions_head, | ||
149 | from_state->transitions_tail, | ||
150 | oth, | ||
151 | t); | ||
152 | } | ||
153 | |||
154 | |||
155 | /** | ||
156 | * Remove a 'transition' from 'state'. | ||
157 | * | ||
158 | * @param state state from which the to-be-removed transition originates. | ||
159 | * @param transition transition that should be removed from state 'state'. | ||
160 | */ | ||
161 | static void | ||
162 | state_remove_transition (struct REGEX_INTERNAL_State *state, | ||
163 | struct REGEX_INTERNAL_Transition *transition) | ||
164 | { | ||
165 | if ((NULL == state) || (NULL == transition)) | ||
166 | return; | ||
167 | |||
168 | if (transition->from_state != state) | ||
169 | return; | ||
170 | |||
171 | GNUNET_free (transition->label); | ||
172 | |||
173 | state->transition_count--; | ||
174 | GNUNET_CONTAINER_DLL_remove (state->transitions_head, | ||
175 | state->transitions_tail, | ||
176 | transition); | ||
177 | |||
178 | GNUNET_free (transition); | ||
179 | } | ||
180 | |||
181 | |||
182 | /** | ||
183 | * Compare two states. Used for sorting. | ||
184 | * | ||
185 | * @param a first state | ||
186 | * @param b second state | ||
187 | * | ||
188 | * @return an integer less than, equal to, or greater than zero | ||
189 | * if the first argument is considered to be respectively | ||
190 | * less than, equal to, or greater than the second. | ||
191 | */ | ||
192 | static int | ||
193 | state_compare (const void *a, const void *b) | ||
194 | { | ||
195 | struct REGEX_INTERNAL_State **s1 = (struct REGEX_INTERNAL_State **) a; | ||
196 | struct REGEX_INTERNAL_State **s2 = (struct REGEX_INTERNAL_State **) b; | ||
197 | |||
198 | return (*s1)->id - (*s2)->id; | ||
199 | } | ||
200 | |||
201 | |||
202 | /** | ||
203 | * Get all edges leaving state @a s. | ||
204 | * | ||
205 | * @param s state. | ||
206 | * @param edges all edges leaving @a s, expected to be allocated and have enough | ||
207 | * space for `s->transitions_count` elements. | ||
208 | * | ||
209 | * @return number of edges. | ||
210 | */ | ||
211 | static unsigned int | ||
212 | state_get_edges (struct REGEX_INTERNAL_State *s, struct REGEX_BLOCK_Edge *edges) | ||
213 | { | ||
214 | struct REGEX_INTERNAL_Transition *t; | ||
215 | unsigned int count; | ||
216 | |||
217 | if (NULL == s) | ||
218 | return 0; | ||
219 | |||
220 | count = 0; | ||
221 | |||
222 | for (t = s->transitions_head; NULL != t; t = t->next) | ||
223 | { | ||
224 | if (NULL != t->to_state) | ||
225 | { | ||
226 | edges[count].label = t->label; | ||
227 | edges[count].destination = t->to_state->hash; | ||
228 | count++; | ||
229 | } | ||
230 | } | ||
231 | return count; | ||
232 | } | ||
233 | |||
234 | |||
235 | /** | ||
236 | * Compare to state sets by comparing the id's of the states that are contained | ||
237 | * in each set. Both sets are expected to be sorted by id! | ||
238 | * | ||
239 | * @param sset1 first state set | ||
240 | * @param sset2 second state set | ||
241 | * @return 0 if the sets are equal, otherwise non-zero | ||
242 | */ | ||
243 | static int | ||
244 | state_set_compare (struct REGEX_INTERNAL_StateSet *sset1, | ||
245 | struct REGEX_INTERNAL_StateSet *sset2) | ||
246 | { | ||
247 | int result; | ||
248 | unsigned int i; | ||
249 | |||
250 | if ((NULL == sset1) || (NULL == sset2)) | ||
251 | return 1; | ||
252 | |||
253 | result = sset1->off - sset2->off; | ||
254 | if (result < 0) | ||
255 | return -1; | ||
256 | if (result > 0) | ||
257 | return 1; | ||
258 | for (i = 0; i < sset1->off; i++) | ||
259 | if (0 != (result = state_compare (&sset1->states[i], &sset2->states[i]))) | ||
260 | break; | ||
261 | return result; | ||
262 | } | ||
263 | |||
264 | |||
265 | /** | ||
266 | * Clears the given StateSet 'set' | ||
267 | * | ||
268 | * @param set set to be cleared | ||
269 | */ | ||
270 | static void | ||
271 | state_set_clear (struct REGEX_INTERNAL_StateSet *set) | ||
272 | { | ||
273 | GNUNET_array_grow (set->states, set->size, 0); | ||
274 | set->off = 0; | ||
275 | } | ||
276 | |||
277 | |||
278 | /** | ||
279 | * Clears an automaton fragment. Does not destroy the states inside the | ||
280 | * automaton. | ||
281 | * | ||
282 | * @param a automaton to be cleared | ||
283 | */ | ||
284 | static void | ||
285 | automaton_fragment_clear (struct REGEX_INTERNAL_Automaton *a) | ||
286 | { | ||
287 | if (NULL == a) | ||
288 | return; | ||
289 | |||
290 | a->start = NULL; | ||
291 | a->end = NULL; | ||
292 | a->states_head = NULL; | ||
293 | a->states_tail = NULL; | ||
294 | a->state_count = 0; | ||
295 | GNUNET_free (a); | ||
296 | } | ||
297 | |||
298 | |||
299 | /** | ||
300 | * Frees the memory used by State @a s | ||
301 | * | ||
302 | * @param s state that should be destroyed | ||
303 | */ | ||
304 | static void | ||
305 | automaton_destroy_state (struct REGEX_INTERNAL_State *s) | ||
306 | { | ||
307 | struct REGEX_INTERNAL_Transition *t; | ||
308 | struct REGEX_INTERNAL_Transition *next_t; | ||
309 | |||
310 | if (NULL == s) | ||
311 | return; | ||
312 | |||
313 | GNUNET_free (s->name); | ||
314 | GNUNET_free (s->proof); | ||
315 | state_set_clear (&s->nfa_set); | ||
316 | for (t = s->transitions_head; NULL != t; t = next_t) | ||
317 | { | ||
318 | next_t = t->next; | ||
319 | state_remove_transition (s, t); | ||
320 | } | ||
321 | |||
322 | GNUNET_free (s); | ||
323 | } | ||
324 | |||
325 | |||
326 | /** | ||
327 | * Remove a state from the given automaton 'a'. Always use this function when | ||
328 | * altering the states of an automaton. Will also remove all transitions leading | ||
329 | * to this state, before destroying it. | ||
330 | * | ||
331 | * @param a automaton | ||
332 | * @param s state to remove | ||
333 | */ | ||
334 | static void | ||
335 | automaton_remove_state (struct REGEX_INTERNAL_Automaton *a, | ||
336 | struct REGEX_INTERNAL_State *s) | ||
337 | { | ||
338 | struct REGEX_INTERNAL_State *s_check; | ||
339 | struct REGEX_INTERNAL_Transition *t_check; | ||
340 | struct REGEX_INTERNAL_Transition *t_check_next; | ||
341 | |||
342 | if ((NULL == a) || (NULL == s)) | ||
343 | return; | ||
344 | |||
345 | /* remove all transitions leading to this state */ | ||
346 | for (s_check = a->states_head; NULL != s_check; s_check = s_check->next) | ||
347 | { | ||
348 | for (t_check = s_check->transitions_head; NULL != t_check; | ||
349 | t_check = t_check_next) | ||
350 | { | ||
351 | t_check_next = t_check->next; | ||
352 | if (t_check->to_state == s) | ||
353 | state_remove_transition (s_check, t_check); | ||
354 | } | ||
355 | } | ||
356 | |||
357 | /* remove state */ | ||
358 | GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s); | ||
359 | a->state_count--; | ||
360 | |||
361 | automaton_destroy_state (s); | ||
362 | } | ||
363 | |||
364 | |||
365 | /** | ||
366 | * Merge two states into one. Will merge 's1' and 's2' into 's1' and destroy | ||
367 | * 's2'. 's1' will contain all (non-duplicate) outgoing transitions of 's2'. | ||
368 | * | ||
369 | * @param ctx context | ||
370 | * @param a automaton | ||
371 | * @param s1 first state | ||
372 | * @param s2 second state, will be destroyed | ||
373 | */ | ||
374 | static void | ||
375 | automaton_merge_states (struct REGEX_INTERNAL_Context *ctx, | ||
376 | struct REGEX_INTERNAL_Automaton *a, | ||
377 | struct REGEX_INTERNAL_State *s1, | ||
378 | struct REGEX_INTERNAL_State *s2) | ||
379 | { | ||
380 | struct REGEX_INTERNAL_State *s_check; | ||
381 | struct REGEX_INTERNAL_Transition *t_check; | ||
382 | struct REGEX_INTERNAL_Transition *t; | ||
383 | struct REGEX_INTERNAL_Transition *t_next; | ||
384 | int is_dup; | ||
385 | |||
386 | if (s1 == s2) | ||
387 | return; | ||
388 | |||
389 | /* 1. Make all transitions pointing to s2 point to s1, unless this transition | ||
390 | * does not already exists, if it already exists remove transition. */ | ||
391 | for (s_check = a->states_head; NULL != s_check; s_check = s_check->next) | ||
392 | { | ||
393 | for (t_check = s_check->transitions_head; NULL != t_check; t_check = t_next) | ||
394 | { | ||
395 | t_next = t_check->next; | ||
396 | |||
397 | if (s2 == t_check->to_state) | ||
398 | { | ||
399 | is_dup = GNUNET_NO; | ||
400 | for (t = t_check->from_state->transitions_head; NULL != t; t = t->next) | ||
401 | { | ||
402 | if ((t->to_state == s1) && (0 == strcmp (t_check->label, t->label)) ) | ||
403 | is_dup = GNUNET_YES; | ||
404 | } | ||
405 | if (GNUNET_NO == is_dup) | ||
406 | t_check->to_state = s1; | ||
407 | else | ||
408 | state_remove_transition (t_check->from_state, t_check); | ||
409 | } | ||
410 | } | ||
411 | } | ||
412 | |||
413 | /* 2. Add all transitions from s2 to sX to s1 */ | ||
414 | for (t_check = s2->transitions_head; NULL != t_check; t_check = t_check->next) | ||
415 | { | ||
416 | if (t_check->to_state != s1) | ||
417 | state_add_transition (ctx, s1, t_check->label, t_check->to_state); | ||
418 | } | ||
419 | |||
420 | /* 3. Rename s1 to {s1,s2} */ | ||
421 | #if REGEX_DEBUG_DFA | ||
422 | char *new_name; | ||
423 | |||
424 | new_name = s1->name; | ||
425 | GNUNET_asprintf (&s1->name, "{%s,%s}", new_name, s2->name); | ||
426 | GNUNET_free (new_name); | ||
427 | #endif | ||
428 | |||
429 | /* remove state */ | ||
430 | GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s2); | ||
431 | a->state_count--; | ||
432 | automaton_destroy_state (s2); | ||
433 | } | ||
434 | |||
435 | |||
436 | /** | ||
437 | * Add a state to the automaton 'a', always use this function to alter the | ||
438 | * states DLL of the automaton. | ||
439 | * | ||
440 | * @param a automaton to add the state to | ||
441 | * @param s state that should be added | ||
442 | */ | ||
443 | static void | ||
444 | automaton_add_state (struct REGEX_INTERNAL_Automaton *a, | ||
445 | struct REGEX_INTERNAL_State *s) | ||
446 | { | ||
447 | GNUNET_CONTAINER_DLL_insert (a->states_head, a->states_tail, s); | ||
448 | a->state_count++; | ||
449 | } | ||
450 | |||
451 | |||
452 | /** | ||
453 | * Depth-first traversal (DFS) of all states that are reachable from state | ||
454 | * 's'. Performs 'action' on each visited state. | ||
455 | * | ||
456 | * @param s start state. | ||
457 | * @param marks an array of size a->state_count to remember which state was | ||
458 | * already visited. | ||
459 | * @param count current count of the state. | ||
460 | * @param check function that is checked before advancing on each transition | ||
461 | * in the DFS. | ||
462 | * @param check_cls closure for check. | ||
463 | * @param action action to be performed on each state. | ||
464 | * @param action_cls closure for action. | ||
465 | */ | ||
466 | static void | ||
467 | automaton_state_traverse (struct REGEX_INTERNAL_State *s, | ||
468 | int *marks, | ||
469 | unsigned int *count, | ||
470 | REGEX_INTERNAL_traverse_check check, | ||
471 | void *check_cls, | ||
472 | REGEX_INTERNAL_traverse_action action, | ||
473 | void *action_cls) | ||
474 | { | ||
475 | struct REGEX_INTERNAL_Transition *t; | ||
476 | |||
477 | if (GNUNET_YES == marks[s->traversal_id]) | ||
478 | return; | ||
479 | |||
480 | marks[s->traversal_id] = GNUNET_YES; | ||
481 | |||
482 | if (NULL != action) | ||
483 | action (action_cls, *count, s); | ||
484 | |||
485 | (*count)++; | ||
486 | |||
487 | for (t = s->transitions_head; NULL != t; t = t->next) | ||
488 | { | ||
489 | if ((NULL == check) || | ||
490 | ((NULL != check) && (GNUNET_YES == check (check_cls, s, t)) )) | ||
491 | { | ||
492 | automaton_state_traverse (t->to_state, | ||
493 | marks, | ||
494 | count, | ||
495 | check, | ||
496 | check_cls, | ||
497 | action, | ||
498 | action_cls); | ||
499 | } | ||
500 | } | ||
501 | } | ||
502 | |||
503 | |||
504 | /** | ||
505 | * Traverses the given automaton using depth-first-search (DFS) from it's start | ||
506 | * state, visiting all reachable states and calling 'action' on each one of | ||
507 | * them. | ||
508 | * | ||
509 | * @param a automaton to be traversed. | ||
510 | * @param start start state, pass a->start or NULL to traverse the whole automaton. | ||
511 | * @param check function that is checked before advancing on each transition | ||
512 | * in the DFS. | ||
513 | * @param check_cls closure for @a check. | ||
514 | * @param action action to be performed on each state. | ||
515 | * @param action_cls closure for @a action | ||
516 | */ | ||
517 | void | ||
518 | REGEX_INTERNAL_automaton_traverse (const struct REGEX_INTERNAL_Automaton *a, | ||
519 | struct REGEX_INTERNAL_State *start, | ||
520 | REGEX_INTERNAL_traverse_check check, | ||
521 | void *check_cls, | ||
522 | REGEX_INTERNAL_traverse_action action, | ||
523 | void *action_cls) | ||
524 | { | ||
525 | unsigned int count; | ||
526 | struct REGEX_INTERNAL_State *s; | ||
527 | |||
528 | if ((NULL == a) || (0 == a->state_count)) | ||
529 | return; | ||
530 | |||
531 | int marks[a->state_count]; | ||
532 | |||
533 | for (count = 0, s = a->states_head; NULL != s && count < a->state_count; | ||
534 | s = s->next, count++) | ||
535 | { | ||
536 | s->traversal_id = count; | ||
537 | marks[s->traversal_id] = GNUNET_NO; | ||
538 | } | ||
539 | |||
540 | count = 0; | ||
541 | |||
542 | if (NULL == start) | ||
543 | s = a->start; | ||
544 | else | ||
545 | s = start; | ||
546 | |||
547 | automaton_state_traverse (s, | ||
548 | marks, | ||
549 | &count, | ||
550 | check, | ||
551 | check_cls, | ||
552 | action, | ||
553 | action_cls); | ||
554 | } | ||
555 | |||
556 | |||
557 | /** | ||
558 | * String container for faster string operations. | ||
559 | */ | ||
560 | struct StringBuffer | ||
561 | { | ||
562 | /** | ||
563 | * Buffer holding the string (may start in the middle!); | ||
564 | * NOT 0-terminated! | ||
565 | */ | ||
566 | char *sbuf; | ||
567 | |||
568 | /** | ||
569 | * Allocated buffer. | ||
570 | */ | ||
571 | char *abuf; | ||
572 | |||
573 | /** | ||
574 | * Length of the string in the buffer. | ||
575 | */ | ||
576 | size_t slen; | ||
577 | |||
578 | /** | ||
579 | * Number of bytes allocated for @e sbuf | ||
580 | */ | ||
581 | unsigned int blen; | ||
582 | |||
583 | /** | ||
584 | * Buffer currently represents "NULL" (not the empty string!) | ||
585 | */ | ||
586 | int16_t null_flag; | ||
587 | |||
588 | /** | ||
589 | * If this entry is part of the last/current generation array, | ||
590 | * this flag is #GNUNET_YES if the last and current generation are | ||
591 | * identical (and thus copying is unnecessary if the value didn't | ||
592 | * change). This is used in an optimization that improves | ||
593 | * performance by about 1% --- if we use int16_t here. With just | ||
594 | * "int" for both flags, performance drops (on my system) significantly, | ||
595 | * most likely due to increased cache misses. | ||
596 | */ | ||
597 | int16_t synced; | ||
598 | }; | ||
599 | |||
600 | |||
601 | /** | ||
602 | * Compare two strings for equality. If either is NULL they are not equal. | ||
603 | * | ||
604 | * @param s1 first string for comparison. | ||
605 | * @param s2 second string for comparison. | ||
606 | * | ||
607 | * @return 0 if the strings are the same or both NULL, 1 or -1 if not. | ||
608 | */ | ||
609 | static int | ||
610 | sb_nullstrcmp (const struct StringBuffer *s1, const struct StringBuffer *s2) | ||
611 | { | ||
612 | if ((GNUNET_YES == s1->null_flag) && (GNUNET_YES == s2->null_flag)) | ||
613 | return 0; | ||
614 | if ((GNUNET_YES == s1->null_flag) || (GNUNET_YES == s2->null_flag)) | ||
615 | return -1; | ||
616 | if (s1->slen != s2->slen) | ||
617 | return -1; | ||
618 | if (0 == s1->slen) | ||
619 | return 0; | ||
620 | return memcmp (s1->sbuf, s2->sbuf, s1->slen); | ||
621 | } | ||
622 | |||
623 | |||
624 | /** | ||
625 | * Compare two strings for equality. | ||
626 | * | ||
627 | * @param s1 first string for comparison. | ||
628 | * @param s2 second string for comparison. | ||
629 | * | ||
630 | * @return 0 if the strings are the same, 1 or -1 if not. | ||
631 | */ | ||
632 | static int | ||
633 | sb_strcmp (const struct StringBuffer *s1, const struct StringBuffer *s2) | ||
634 | { | ||
635 | if (s1->slen != s2->slen) | ||
636 | return -1; | ||
637 | if (0 == s1->slen) | ||
638 | return 0; | ||
639 | return memcmp (s1->sbuf, s2->sbuf, s1->slen); | ||
640 | } | ||
641 | |||
642 | |||
643 | /** | ||
644 | * Reallocate the buffer of 'ret' to fit 'nlen' characters; | ||
645 | * move the existing string to the beginning of the new buffer. | ||
646 | * | ||
647 | * @param ret current buffer, to be updated | ||
648 | * @param nlen target length for the buffer, must be at least ret->slen | ||
649 | */ | ||
650 | static void | ||
651 | sb_realloc (struct StringBuffer *ret, size_t nlen) | ||
652 | { | ||
653 | char *old; | ||
654 | |||
655 | GNUNET_assert (nlen >= ret->slen); | ||
656 | old = ret->abuf; | ||
657 | ret->abuf = GNUNET_malloc (nlen); | ||
658 | ret->blen = nlen; | ||
659 | GNUNET_memcpy (ret->abuf, ret->sbuf, ret->slen); | ||
660 | ret->sbuf = ret->abuf; | ||
661 | GNUNET_free (old); | ||
662 | } | ||
663 | |||
664 | |||
665 | /** | ||
666 | * Append a string. | ||
667 | * | ||
668 | * @param ret where to write the result | ||
669 | * @param sarg string to append | ||
670 | */ | ||
671 | static void | ||
672 | sb_append (struct StringBuffer *ret, const struct StringBuffer *sarg) | ||
673 | { | ||
674 | if (GNUNET_YES == ret->null_flag) | ||
675 | ret->slen = 0; | ||
676 | ret->null_flag = GNUNET_NO; | ||
677 | if (ret->blen < sarg->slen + ret->slen) | ||
678 | sb_realloc (ret, ret->blen + sarg->slen + 128); | ||
679 | GNUNET_memcpy (&ret->sbuf[ret->slen], sarg->sbuf, sarg->slen); | ||
680 | ret->slen += sarg->slen; | ||
681 | } | ||
682 | |||
683 | |||
684 | /** | ||
685 | * Append a C string. | ||
686 | * | ||
687 | * @param ret where to write the result | ||
688 | * @param cstr string to append | ||
689 | */ | ||
690 | static void | ||
691 | sb_append_cstr (struct StringBuffer *ret, const char *cstr) | ||
692 | { | ||
693 | size_t cstr_len = strlen (cstr); | ||
694 | |||
695 | if (GNUNET_YES == ret->null_flag) | ||
696 | ret->slen = 0; | ||
697 | ret->null_flag = GNUNET_NO; | ||
698 | if (ret->blen < cstr_len + ret->slen) | ||
699 | sb_realloc (ret, ret->blen + cstr_len + 128); | ||
700 | GNUNET_memcpy (&ret->sbuf[ret->slen], cstr, cstr_len); | ||
701 | ret->slen += cstr_len; | ||
702 | } | ||
703 | |||
704 | |||
705 | /** | ||
706 | * Wrap a string buffer, that is, set ret to the format string | ||
707 | * which contains an "%s" which is to be replaced with the original | ||
708 | * content of 'ret'. Note that optimizing this function is not | ||
709 | * really worth it, it is rarely called. | ||
710 | * | ||
711 | * @param ret where to write the result and take the input for %.*s from | ||
712 | * @param format format string, fprintf-style, with exactly one "%.*s" | ||
713 | * @param extra_chars how long will the result be, in addition to 'sarg' length | ||
714 | */ | ||
715 | static void | ||
716 | sb_wrap (struct StringBuffer *ret, const char *format, size_t extra_chars) | ||
717 | { | ||
718 | char *temp; | ||
719 | |||
720 | if (GNUNET_YES == ret->null_flag) | ||
721 | ret->slen = 0; | ||
722 | ret->null_flag = GNUNET_NO; | ||
723 | temp = GNUNET_malloc (ret->slen + extra_chars + 1); | ||
724 | GNUNET_snprintf (temp, | ||
725 | ret->slen + extra_chars + 1, | ||
726 | format, | ||
727 | (int) ret->slen, | ||
728 | ret->sbuf); | ||
729 | GNUNET_free (ret->abuf); | ||
730 | ret->abuf = temp; | ||
731 | ret->sbuf = temp; | ||
732 | ret->blen = ret->slen + extra_chars + 1; | ||
733 | ret->slen = ret->slen + extra_chars; | ||
734 | } | ||
735 | |||
736 | |||
737 | /** | ||
738 | * Format a string buffer. Note that optimizing this function is not | ||
739 | * really worth it, it is rarely called. | ||
740 | * | ||
741 | * @param ret where to write the result | ||
742 | * @param format format string, fprintf-style, with exactly one "%.*s" | ||
743 | * @param extra_chars how long will the result be, in addition to 'sarg' length | ||
744 | * @param sarg string to print into the format | ||
745 | */ | ||
746 | static void | ||
747 | sb_printf1 (struct StringBuffer *ret, | ||
748 | const char *format, | ||
749 | size_t extra_chars, | ||
750 | const struct StringBuffer *sarg) | ||
751 | { | ||
752 | if (ret->blen < sarg->slen + extra_chars + 1) | ||
753 | sb_realloc (ret, sarg->slen + extra_chars + 1); | ||
754 | ret->null_flag = GNUNET_NO; | ||
755 | ret->sbuf = ret->abuf; | ||
756 | ret->slen = sarg->slen + extra_chars; | ||
757 | GNUNET_snprintf (ret->sbuf, ret->blen, format, (int) sarg->slen, sarg->sbuf); | ||
758 | } | ||
759 | |||
760 | |||
761 | /** | ||
762 | * Format a string buffer. | ||
763 | * | ||
764 | * @param ret where to write the result | ||
765 | * @param format format string, fprintf-style, with exactly two "%.*s" | ||
766 | * @param extra_chars how long will the result be, in addition to 'sarg1/2' length | ||
767 | * @param sarg1 first string to print into the format | ||
768 | * @param sarg2 second string to print into the format | ||
769 | */ | ||
770 | static void | ||
771 | sb_printf2 (struct StringBuffer *ret, | ||
772 | const char *format, | ||
773 | size_t extra_chars, | ||
774 | const struct StringBuffer *sarg1, | ||
775 | const struct StringBuffer *sarg2) | ||
776 | { | ||
777 | if (ret->blen < sarg1->slen + sarg2->slen + extra_chars + 1) | ||
778 | sb_realloc (ret, sarg1->slen + sarg2->slen + extra_chars + 1); | ||
779 | ret->null_flag = GNUNET_NO; | ||
780 | ret->slen = sarg1->slen + sarg2->slen + extra_chars; | ||
781 | ret->sbuf = ret->abuf; | ||
782 | GNUNET_snprintf (ret->sbuf, | ||
783 | ret->blen, | ||
784 | format, | ||
785 | (int) sarg1->slen, | ||
786 | sarg1->sbuf, | ||
787 | (int) sarg2->slen, | ||
788 | sarg2->sbuf); | ||
789 | } | ||
790 | |||
791 | |||
792 | /** | ||
793 | * Format a string buffer. Note that optimizing this function is not | ||
794 | * really worth it, it is rarely called. | ||
795 | * | ||
796 | * @param ret where to write the result | ||
797 | * @param format format string, fprintf-style, with exactly three "%.*s" | ||
798 | * @param extra_chars how long will the result be, in addition to 'sarg1/2/3' length | ||
799 | * @param sarg1 first string to print into the format | ||
800 | * @param sarg2 second string to print into the format | ||
801 | * @param sarg3 third string to print into the format | ||
802 | */ | ||
803 | static void | ||
804 | sb_printf3 (struct StringBuffer *ret, | ||
805 | const char *format, | ||
806 | size_t extra_chars, | ||
807 | const struct StringBuffer *sarg1, | ||
808 | const struct StringBuffer *sarg2, | ||
809 | const struct StringBuffer *sarg3) | ||
810 | { | ||
811 | if (ret->blen < sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1) | ||
812 | sb_realloc (ret, sarg1->slen + sarg2->slen + sarg3->slen + extra_chars + 1); | ||
813 | ret->null_flag = GNUNET_NO; | ||
814 | ret->slen = sarg1->slen + sarg2->slen + sarg3->slen + extra_chars; | ||
815 | ret->sbuf = ret->abuf; | ||
816 | GNUNET_snprintf (ret->sbuf, | ||
817 | ret->blen, | ||
818 | format, | ||
819 | (int) sarg1->slen, | ||
820 | sarg1->sbuf, | ||
821 | (int) sarg2->slen, | ||
822 | sarg2->sbuf, | ||
823 | (int) sarg3->slen, | ||
824 | sarg3->sbuf); | ||
825 | } | ||
826 | |||
827 | |||
828 | /** | ||
829 | * Free resources of the given string buffer. | ||
830 | * | ||
831 | * @param sb buffer to free (actual pointer is not freed, as they | ||
832 | * should not be individually allocated) | ||
833 | */ | ||
834 | static void | ||
835 | sb_free (struct StringBuffer *sb) | ||
836 | { | ||
837 | GNUNET_array_grow (sb->abuf, sb->blen, 0); | ||
838 | sb->slen = 0; | ||
839 | sb->sbuf = NULL; | ||
840 | sb->null_flag = GNUNET_YES; | ||
841 | } | ||
842 | |||
843 | |||
844 | /** | ||
845 | * Copy the given string buffer from 'in' to 'out'. | ||
846 | * | ||
847 | * @param in input string | ||
848 | * @param out output string | ||
849 | */ | ||
850 | static void | ||
851 | sb_strdup (struct StringBuffer *out, const struct StringBuffer *in) | ||
852 | |||
853 | { | ||
854 | out->null_flag = in->null_flag; | ||
855 | if (GNUNET_YES == out->null_flag) | ||
856 | return; | ||
857 | if (out->blen < in->slen) | ||
858 | { | ||
859 | GNUNET_array_grow (out->abuf, out->blen, in->slen); | ||
860 | } | ||
861 | out->sbuf = out->abuf; | ||
862 | out->slen = in->slen; | ||
863 | GNUNET_memcpy (out->sbuf, in->sbuf, out->slen); | ||
864 | } | ||
865 | |||
866 | |||
867 | /** | ||
868 | * Copy the given string buffer from 'in' to 'out'. | ||
869 | * | ||
870 | * @param cstr input string | ||
871 | * @param out output string | ||
872 | */ | ||
873 | static void | ||
874 | sb_strdup_cstr (struct StringBuffer *out, const char *cstr) | ||
875 | { | ||
876 | if (NULL == cstr) | ||
877 | { | ||
878 | out->null_flag = GNUNET_YES; | ||
879 | return; | ||
880 | } | ||
881 | out->null_flag = GNUNET_NO; | ||
882 | out->slen = strlen (cstr); | ||
883 | if (out->blen < out->slen) | ||
884 | { | ||
885 | GNUNET_array_grow (out->abuf, out->blen, out->slen); | ||
886 | } | ||
887 | out->sbuf = out->abuf; | ||
888 | GNUNET_memcpy (out->sbuf, cstr, out->slen); | ||
889 | } | ||
890 | |||
891 | |||
892 | /** | ||
893 | * Check if the given string @a str needs parentheses around it when | ||
894 | * using it to generate a regex. | ||
895 | * | ||
896 | * @param str string | ||
897 | * | ||
898 | * @return #GNUNET_YES if parentheses are needed, #GNUNET_NO otherwise | ||
899 | */ | ||
900 | static int | ||
901 | needs_parentheses (const struct StringBuffer *str) | ||
902 | { | ||
903 | size_t slen; | ||
904 | const char *op; | ||
905 | const char *cl; | ||
906 | const char *pos; | ||
907 | const char *end; | ||
908 | unsigned int cnt; | ||
909 | |||
910 | if ((GNUNET_YES == str->null_flag) || ((slen = str->slen) < 2)) | ||
911 | return GNUNET_NO; | ||
912 | pos = str->sbuf; | ||
913 | if ('(' != pos[0]) | ||
914 | return GNUNET_YES; | ||
915 | end = str->sbuf + slen; | ||
916 | cnt = 1; | ||
917 | pos++; | ||
918 | while (cnt > 0) | ||
919 | { | ||
920 | cl = memchr (pos, ')', end - pos); | ||
921 | if (NULL == cl) | ||
922 | { | ||
923 | GNUNET_break (0); | ||
924 | return GNUNET_YES; | ||
925 | } | ||
926 | /* while '(' before ')', count opening parens */ | ||
927 | while ((NULL != (op = memchr (pos, '(', end - pos))) && (op < cl)) | ||
928 | { | ||
929 | cnt++; | ||
930 | pos = op + 1; | ||
931 | } | ||
932 | /* got ')' first */ | ||
933 | cnt--; | ||
934 | pos = cl + 1; | ||
935 | } | ||
936 | return (*pos == '\0') ? GNUNET_NO : GNUNET_YES; | ||
937 | } | ||
938 | |||
939 | |||
940 | /** | ||
941 | * Remove parentheses surrounding string @a str. | ||
942 | * Example: "(a)" becomes "a", "(a|b)|(a|c)" stays the same. | ||
943 | * You need to #GNUNET_free() the returned string. | ||
944 | * | ||
945 | * @param str string, modified to contain a | ||
946 | * @return string without surrounding parentheses, string 'str' if no preceding | ||
947 | * epsilon could be found, NULL if 'str' was NULL | ||
948 | */ | ||
949 | static void | ||
950 | remove_parentheses (struct StringBuffer *str) | ||
951 | { | ||
952 | size_t slen; | ||
953 | const char *pos; | ||
954 | const char *end; | ||
955 | const char *sbuf; | ||
956 | const char *op; | ||
957 | const char *cp; | ||
958 | unsigned int cnt; | ||
959 | |||
960 | if (0) | ||
961 | return; | ||
962 | sbuf = str->sbuf; | ||
963 | if ((GNUNET_YES == str->null_flag) || (1 >= (slen = str->slen)) || | ||
964 | ('(' != str->sbuf[0]) || (')' != str->sbuf[slen - 1])) | ||
965 | return; | ||
966 | cnt = 0; | ||
967 | pos = &sbuf[1]; | ||
968 | end = &sbuf[slen - 1]; | ||
969 | op = memchr (pos, '(', end - pos); | ||
970 | cp = memchr (pos, ')', end - pos); | ||
971 | while (NULL != cp) | ||
972 | { | ||
973 | while ((NULL != op) && (op < cp)) | ||
974 | { | ||
975 | cnt++; | ||
976 | pos = op + 1; | ||
977 | op = memchr (pos, '(', end - pos); | ||
978 | } | ||
979 | while ((NULL != cp) && ((NULL == op) || (cp < op))) | ||
980 | { | ||
981 | if (0 == cnt) | ||
982 | return; /* can't strip parens */ | ||
983 | cnt--; | ||
984 | pos = cp + 1; | ||
985 | cp = memchr (pos, ')', end - pos); | ||
986 | } | ||
987 | } | ||
988 | if (0 != cnt) | ||
989 | { | ||
990 | GNUNET_break (0); | ||
991 | return; | ||
992 | } | ||
993 | str->sbuf++; | ||
994 | str->slen -= 2; | ||
995 | } | ||
996 | |||
997 | |||
998 | /** | ||
999 | * Check if the string 'str' starts with an epsilon (empty string). | ||
1000 | * Example: "(|a)" is starting with an epsilon. | ||
1001 | * | ||
1002 | * @param str string to test | ||
1003 | * | ||
1004 | * @return 0 if str has no epsilon, 1 if str starts with '(|' and ends with ')' | ||
1005 | */ | ||
1006 | static int | ||
1007 | has_epsilon (const struct StringBuffer *str) | ||
1008 | { | ||
1009 | return (GNUNET_YES != str->null_flag) && (0 < str->slen) && | ||
1010 | ('(' == str->sbuf[0]) && ('|' == str->sbuf[1]) && | ||
1011 | (')' == str->sbuf[str->slen - 1]); | ||
1012 | } | ||
1013 | |||
1014 | |||
1015 | /** | ||
1016 | * Remove an epsilon from the string str. Where epsilon is an empty string | ||
1017 | * Example: str = "(|a|b|c)", result: "a|b|c" | ||
1018 | * The returned string needs to be freed. | ||
1019 | * | ||
1020 | * @param str original string | ||
1021 | * @param ret where to return string without preceding epsilon, string 'str' if no preceding | ||
1022 | * epsilon could be found, NULL if 'str' was NULL | ||
1023 | */ | ||
1024 | static void | ||
1025 | remove_epsilon (const struct StringBuffer *str, struct StringBuffer *ret) | ||
1026 | { | ||
1027 | if (GNUNET_YES == str->null_flag) | ||
1028 | { | ||
1029 | ret->null_flag = GNUNET_YES; | ||
1030 | return; | ||
1031 | } | ||
1032 | if ((str->slen > 1) && ('(' == str->sbuf[0]) && ('|' == str->sbuf[1]) && | ||
1033 | (')' == str->sbuf[str->slen - 1])) | ||
1034 | { | ||
1035 | /* remove epsilon */ | ||
1036 | if (ret->blen < str->slen - 3) | ||
1037 | { | ||
1038 | GNUNET_array_grow (ret->abuf, ret->blen, str->slen - 3); | ||
1039 | } | ||
1040 | ret->sbuf = ret->abuf; | ||
1041 | ret->slen = str->slen - 3; | ||
1042 | GNUNET_memcpy (ret->sbuf, &str->sbuf[2], ret->slen); | ||
1043 | return; | ||
1044 | } | ||
1045 | sb_strdup (ret, str); | ||
1046 | } | ||
1047 | |||
1048 | |||
1049 | /** | ||
1050 | * Compare n bytes of 'str1' and 'str2' | ||
1051 | * | ||
1052 | * @param str1 first string to compare | ||
1053 | * @param str2 second string for comparison | ||
1054 | * @param n number of bytes to compare | ||
1055 | * | ||
1056 | * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise | ||
1057 | */ | ||
1058 | static int | ||
1059 | sb_strncmp (const struct StringBuffer *str1, | ||
1060 | const struct StringBuffer *str2, | ||
1061 | size_t n) | ||
1062 | { | ||
1063 | size_t max; | ||
1064 | |||
1065 | if ((str1->slen != str2->slen) && ((str1->slen < n) || (str2->slen < n))) | ||
1066 | return -1; | ||
1067 | max = GNUNET_MAX (str1->slen, str2->slen); | ||
1068 | if (max > n) | ||
1069 | max = n; | ||
1070 | return memcmp (str1->sbuf, str2->sbuf, max); | ||
1071 | } | ||
1072 | |||
1073 | |||
1074 | /** | ||
1075 | * Compare n bytes of 'str1' and 'str2' | ||
1076 | * | ||
1077 | * @param str1 first string to compare | ||
1078 | * @param str2 second C string for comparison | ||
1079 | * @param n number of bytes to compare (and length of str2) | ||
1080 | * | ||
1081 | * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise | ||
1082 | */ | ||
1083 | static int | ||
1084 | sb_strncmp_cstr (const struct StringBuffer *str1, const char *str2, size_t n) | ||
1085 | { | ||
1086 | if (str1->slen < n) | ||
1087 | return -1; | ||
1088 | return memcmp (str1->sbuf, str2, n); | ||
1089 | } | ||
1090 | |||
1091 | |||
1092 | /** | ||
1093 | * Initialize string buffer for storing strings of up to n | ||
1094 | * characters. | ||
1095 | * | ||
1096 | * @param sb buffer to initialize | ||
1097 | * @param n desired target length | ||
1098 | */ | ||
1099 | static void | ||
1100 | sb_init (struct StringBuffer *sb, size_t n) | ||
1101 | { | ||
1102 | sb->null_flag = GNUNET_NO; | ||
1103 | sb->abuf = sb->sbuf = (0 == n) ? NULL : GNUNET_malloc (n); | ||
1104 | sb->blen = n; | ||
1105 | sb->slen = 0; | ||
1106 | } | ||
1107 | |||
1108 | |||
1109 | /** | ||
1110 | * Compare 'str1', starting from position 'k', with whole 'str2' | ||
1111 | * | ||
1112 | * @param str1 first string to compare, starting from position 'k' | ||
1113 | * @param str2 second string for comparison | ||
1114 | * @param k starting position in 'str1' | ||
1115 | * | ||
1116 | * @return -1 if any of the strings is NULL, 0 if equal, non 0 otherwise | ||
1117 | */ | ||
1118 | static int | ||
1119 | sb_strkcmp (const struct StringBuffer *str1, | ||
1120 | const struct StringBuffer *str2, | ||
1121 | size_t k) | ||
1122 | { | ||
1123 | if ((GNUNET_YES == str1->null_flag) || (GNUNET_YES == str2->null_flag) || | ||
1124 | (k > str1->slen) || (str1->slen - k != str2->slen)) | ||
1125 | return -1; | ||
1126 | return memcmp (&str1->sbuf[k], str2->sbuf, str2->slen); | ||
1127 | } | ||
1128 | |||
1129 | |||
1130 | /** | ||
1131 | * Helper function used as 'action' in 'REGEX_INTERNAL_automaton_traverse' | ||
1132 | * function to create the depth-first numbering of the states. | ||
1133 | * | ||
1134 | * @param cls states array. | ||
1135 | * @param count current state counter. | ||
1136 | * @param s current state. | ||
1137 | */ | ||
1138 | static void | ||
1139 | number_states (void *cls, | ||
1140 | const unsigned int count, | ||
1141 | struct REGEX_INTERNAL_State *s) | ||
1142 | { | ||
1143 | struct REGEX_INTERNAL_State **states = cls; | ||
1144 | |||
1145 | s->dfs_id = count; | ||
1146 | if (NULL != states) | ||
1147 | states[count] = s; | ||
1148 | } | ||
1149 | |||
1150 | |||
1151 | #define PRIS(a) \ | ||
1152 | ((GNUNET_YES == a.null_flag) ? 6 : (int) a.slen), \ | ||
1153 | ((GNUNET_YES == a.null_flag) ? "(null)" : a.sbuf) | ||
1154 | |||
1155 | |||
1156 | /** | ||
1157 | * Construct the regular expression given the inductive step, | ||
1158 | * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* | ||
1159 | * R^{(k-1)}_{kj}, and simplify the resulting expression saved in R_cur_ij. | ||
1160 | * | ||
1161 | * @param R_last_ij value of $R^{(k-1)_{ij}. | ||
1162 | * @param R_last_ik value of $R^{(k-1)_{ik}. | ||
1163 | * @param R_last_kk value of $R^{(k-1)_{kk}. | ||
1164 | * @param R_last_kj value of $R^{(k-1)_{kj}. | ||
1165 | * @param R_cur_ij result for this inductive step is saved in R_cur_ij, R_cur_ij | ||
1166 | * is expected to be NULL when called! | ||
1167 | * @param R_cur_l optimization -- kept between iterations to avoid realloc | ||
1168 | * @param R_cur_r optimization -- kept between iterations to avoid realloc | ||
1169 | */ | ||
1170 | static void | ||
1171 | automaton_create_proofs_simplify (const struct StringBuffer *R_last_ij, | ||
1172 | const struct StringBuffer *R_last_ik, | ||
1173 | const struct StringBuffer *R_last_kk, | ||
1174 | const struct StringBuffer *R_last_kj, | ||
1175 | struct StringBuffer *R_cur_ij, | ||
1176 | struct StringBuffer *R_cur_l, | ||
1177 | struct StringBuffer *R_cur_r) | ||
1178 | { | ||
1179 | struct StringBuffer R_temp_ij; | ||
1180 | struct StringBuffer R_temp_ik; | ||
1181 | struct StringBuffer R_temp_kj; | ||
1182 | struct StringBuffer R_temp_kk; | ||
1183 | int eps_check; | ||
1184 | int ij_ik_cmp; | ||
1185 | int ij_kj_cmp; | ||
1186 | int ik_kk_cmp; | ||
1187 | int kk_kj_cmp; | ||
1188 | int clean_ik_kk_cmp; | ||
1189 | int clean_kk_kj_cmp; | ||
1190 | size_t length; | ||
1191 | size_t length_l; | ||
1192 | size_t length_r; | ||
1193 | |||
1194 | /* | ||
1195 | * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} | ||
1196 | * R_last == R^{(k-1)}, R_cur == R^{(k)} | ||
1197 | * R_cur_ij = R_cur_l | R_cur_r | ||
1198 | * R_cur_l == R^{(k-1)}_{ij} | ||
1199 | * R_cur_r == R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} | ||
1200 | */if ((GNUNET_YES == R_last_ij->null_flag) && | ||
1201 | ((GNUNET_YES == R_last_ik->null_flag) || | ||
1202 | (GNUNET_YES == R_last_kj->null_flag))) | ||
1203 | { | ||
1204 | /* R^{(k)}_{ij} = N | N */ | ||
1205 | R_cur_ij->null_flag = GNUNET_YES; | ||
1206 | R_cur_ij->synced = GNUNET_NO; | ||
1207 | return; | ||
1208 | } | ||
1209 | |||
1210 | if ((GNUNET_YES == R_last_ik->null_flag) || | ||
1211 | (GNUNET_YES == R_last_kj->null_flag)) | ||
1212 | { | ||
1213 | /* R^{(k)}_{ij} = R^{(k-1)}_{ij} | N */ | ||
1214 | if (GNUNET_YES == R_last_ij->synced) | ||
1215 | { | ||
1216 | R_cur_ij->synced = GNUNET_YES; | ||
1217 | R_cur_ij->null_flag = GNUNET_NO; | ||
1218 | return; | ||
1219 | } | ||
1220 | R_cur_ij->synced = GNUNET_YES; | ||
1221 | sb_strdup (R_cur_ij, R_last_ij); | ||
1222 | return; | ||
1223 | } | ||
1224 | R_cur_ij->synced = GNUNET_NO; | ||
1225 | |||
1226 | /* $R^{(k)}_{ij} = N | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} OR | ||
1227 | * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} */ | ||
1228 | |||
1229 | R_cur_r->null_flag = GNUNET_YES; | ||
1230 | R_cur_r->slen = 0; | ||
1231 | R_cur_l->null_flag = GNUNET_YES; | ||
1232 | R_cur_l->slen = 0; | ||
1233 | |||
1234 | /* cache results from strcmp, we might need these many times */ | ||
1235 | ij_kj_cmp = sb_nullstrcmp (R_last_ij, R_last_kj); | ||
1236 | ij_ik_cmp = sb_nullstrcmp (R_last_ij, R_last_ik); | ||
1237 | ik_kk_cmp = sb_nullstrcmp (R_last_ik, R_last_kk); | ||
1238 | kk_kj_cmp = sb_nullstrcmp (R_last_kk, R_last_kj); | ||
1239 | |||
1240 | /* Assign R_temp_(ik|kk|kj) to R_last[][] and remove epsilon as well | ||
1241 | * as parentheses, so we can better compare the contents */ | ||
1242 | |||
1243 | memset (&R_temp_ij, 0, sizeof(struct StringBuffer)); | ||
1244 | memset (&R_temp_ik, 0, sizeof(struct StringBuffer)); | ||
1245 | memset (&R_temp_kk, 0, sizeof(struct StringBuffer)); | ||
1246 | memset (&R_temp_kj, 0, sizeof(struct StringBuffer)); | ||
1247 | remove_epsilon (R_last_ik, &R_temp_ik); | ||
1248 | remove_epsilon (R_last_kk, &R_temp_kk); | ||
1249 | remove_epsilon (R_last_kj, &R_temp_kj); | ||
1250 | remove_parentheses (&R_temp_ik); | ||
1251 | remove_parentheses (&R_temp_kk); | ||
1252 | remove_parentheses (&R_temp_kj); | ||
1253 | clean_ik_kk_cmp = sb_nullstrcmp (R_last_ik, &R_temp_kk); | ||
1254 | clean_kk_kj_cmp = sb_nullstrcmp (&R_temp_kk, R_last_kj); | ||
1255 | |||
1256 | /* construct R_cur_l (and, if necessary R_cur_r) */ | ||
1257 | if (GNUNET_YES != R_last_ij->null_flag) | ||
1258 | { | ||
1259 | /* Assign R_temp_ij to R_last_ij and remove epsilon as well | ||
1260 | * as parentheses, so we can better compare the contents */ | ||
1261 | remove_epsilon (R_last_ij, &R_temp_ij); | ||
1262 | remove_parentheses (&R_temp_ij); | ||
1263 | |||
1264 | if ((0 == sb_strcmp (&R_temp_ij, &R_temp_ik)) && | ||
1265 | (0 == sb_strcmp (&R_temp_ik, &R_temp_kk)) && | ||
1266 | (0 == sb_strcmp (&R_temp_kk, &R_temp_kj))) | ||
1267 | { | ||
1268 | if (0 == R_temp_ij.slen) | ||
1269 | { | ||
1270 | R_cur_r->null_flag = GNUNET_NO; | ||
1271 | } | ||
1272 | else if ((0 == sb_strncmp_cstr (R_last_ij, "(|", 2)) || | ||
1273 | ((0 == sb_strncmp_cstr (R_last_ik, "(|", 2)) && | ||
1274 | (0 == sb_strncmp_cstr (R_last_kj, "(|", 2)) )) | ||
1275 | { | ||
1276 | /* | ||
1277 | * a|(e|a)a*(e|a) = a* | ||
1278 | * a|(e|a)(e|a)*(e|a) = a* | ||
1279 | * (e|a)|aa*a = a* | ||
1280 | * (e|a)|aa*(e|a) = a* | ||
1281 | * (e|a)|(e|a)a*a = a* | ||
1282 | * (e|a)|(e|a)a*(e|a) = a* | ||
1283 | * (e|a)|(e|a)(e|a)*(e|a) = a* | ||
1284 | */if (GNUNET_YES == needs_parentheses (&R_temp_ij)) | ||
1285 | sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_ij); | ||
1286 | else | ||
1287 | sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_ij); | ||
1288 | } | ||
1289 | else | ||
1290 | { | ||
1291 | /* | ||
1292 | * a|aa*a = a+ | ||
1293 | * a|(e|a)a*a = a+ | ||
1294 | * a|aa*(e|a) = a+ | ||
1295 | * a|(e|a)(e|a)*a = a+ | ||
1296 | * a|a(e|a)*(e|a) = a+ | ||
1297 | */if (GNUNET_YES == needs_parentheses (&R_temp_ij)) | ||
1298 | sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_ij); | ||
1299 | else | ||
1300 | sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_ij); | ||
1301 | } | ||
1302 | } | ||
1303 | else if ((0 == ij_ik_cmp) && (0 == clean_kk_kj_cmp) && | ||
1304 | (0 != clean_ik_kk_cmp)) | ||
1305 | { | ||
1306 | /* a|ab*b = ab* */ | ||
1307 | if (0 == R_last_kk->slen) | ||
1308 | sb_strdup (R_cur_r, R_last_ij); | ||
1309 | else if (GNUNET_YES == needs_parentheses (&R_temp_kk)) | ||
1310 | sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk); | ||
1311 | else | ||
1312 | sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, R_last_kk); | ||
1313 | R_cur_l->null_flag = GNUNET_YES; | ||
1314 | } | ||
1315 | else if ((0 == ij_kj_cmp) && (0 == clean_ik_kk_cmp) && | ||
1316 | (0 != clean_kk_kj_cmp)) | ||
1317 | { | ||
1318 | /* a|bb*a = b*a */ | ||
1319 | if (R_last_kk->slen < 1) | ||
1320 | { | ||
1321 | sb_strdup (R_cur_r, R_last_kj); | ||
1322 | } | ||
1323 | else if (GNUNET_YES == needs_parentheses (&R_temp_kk)) | ||
1324 | sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj); | ||
1325 | else | ||
1326 | sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj); | ||
1327 | |||
1328 | R_cur_l->null_flag = GNUNET_YES; | ||
1329 | } | ||
1330 | else if ((0 == ij_ik_cmp) && (0 == kk_kj_cmp) && | ||
1331 | (! has_epsilon (R_last_ij)) && has_epsilon (R_last_kk)) | ||
1332 | { | ||
1333 | /* a|a(e|b)*(e|b) = a|ab* = a|a|ab|abb|abbb|... = ab* */ | ||
1334 | if (needs_parentheses (&R_temp_kk)) | ||
1335 | sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ij, &R_temp_kk); | ||
1336 | else | ||
1337 | sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ij, &R_temp_kk); | ||
1338 | R_cur_l->null_flag = GNUNET_YES; | ||
1339 | } | ||
1340 | else if ((0 == ij_kj_cmp) && (0 == ik_kk_cmp) && | ||
1341 | (! has_epsilon (R_last_ij)) && has_epsilon (R_last_kk)) | ||
1342 | { | ||
1343 | /* a|(e|b)(e|b)*a = a|b*a = a|a|ba|bba|bbba|... = b*a */ | ||
1344 | if (needs_parentheses (&R_temp_kk)) | ||
1345 | sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_ij); | ||
1346 | else | ||
1347 | sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_ij); | ||
1348 | R_cur_l->null_flag = GNUNET_YES; | ||
1349 | } | ||
1350 | else | ||
1351 | { | ||
1352 | sb_strdup (R_cur_l, R_last_ij); | ||
1353 | remove_parentheses (R_cur_l); | ||
1354 | } | ||
1355 | } | ||
1356 | else | ||
1357 | { | ||
1358 | /* we have no left side */ | ||
1359 | R_cur_l->null_flag = GNUNET_YES; | ||
1360 | } | ||
1361 | |||
1362 | /* construct R_cur_r, if not already constructed */ | ||
1363 | if (GNUNET_YES == R_cur_r->null_flag) | ||
1364 | { | ||
1365 | length = R_temp_kk.slen - R_last_ik->slen; | ||
1366 | |||
1367 | /* a(ba)*bx = (ab)+x */ | ||
1368 | if ((length > 0) && (GNUNET_YES != R_last_kk->null_flag) && | ||
1369 | (0 < R_last_kk->slen) && (GNUNET_YES != R_last_kj->null_flag) && | ||
1370 | (0 < R_last_kj->slen) && (GNUNET_YES != R_last_ik->null_flag) && | ||
1371 | (0 < R_last_ik->slen) && | ||
1372 | (0 == sb_strkcmp (&R_temp_kk, R_last_ik, length)) && | ||
1373 | (0 == sb_strncmp (&R_temp_kk, R_last_kj, length))) | ||
1374 | { | ||
1375 | struct StringBuffer temp_a; | ||
1376 | struct StringBuffer temp_b; | ||
1377 | |||
1378 | sb_init (&temp_a, length); | ||
1379 | sb_init (&temp_b, R_last_kj->slen - length); | ||
1380 | |||
1381 | length_l = length; | ||
1382 | temp_a.sbuf = temp_a.abuf; | ||
1383 | GNUNET_memcpy (temp_a.sbuf, R_last_kj->sbuf, length_l); | ||
1384 | temp_a.slen = length_l; | ||
1385 | |||
1386 | length_r = R_last_kj->slen - length; | ||
1387 | temp_b.sbuf = temp_b.abuf; | ||
1388 | GNUNET_memcpy (temp_b.sbuf, &R_last_kj->sbuf[length], length_r); | ||
1389 | temp_b.slen = length_r; | ||
1390 | |||
1391 | /* e|(ab)+ = (ab)* */ | ||
1392 | if ((GNUNET_YES != R_cur_l->null_flag) && (0 == R_cur_l->slen) && | ||
1393 | (0 == temp_b.slen)) | ||
1394 | { | ||
1395 | sb_printf2 (R_cur_r, "(%.*s%.*s)*", 3, R_last_ik, &temp_a); | ||
1396 | sb_free (R_cur_l); | ||
1397 | R_cur_l->null_flag = GNUNET_YES; | ||
1398 | } | ||
1399 | else | ||
1400 | { | ||
1401 | sb_printf3 (R_cur_r, "(%.*s%.*s)+%.*s", 3, R_last_ik, &temp_a, &temp_b); | ||
1402 | } | ||
1403 | sb_free (&temp_a); | ||
1404 | sb_free (&temp_b); | ||
1405 | } | ||
1406 | else if ((0 == sb_strcmp (&R_temp_ik, &R_temp_kk)) && | ||
1407 | (0 == sb_strcmp (&R_temp_kk, &R_temp_kj)) ) | ||
1408 | { | ||
1409 | /* | ||
1410 | * (e|a)a*(e|a) = a* | ||
1411 | * (e|a)(e|a)*(e|a) = a* | ||
1412 | */ | ||
1413 | if (has_epsilon (R_last_ik) && has_epsilon (R_last_kj)) | ||
1414 | { | ||
1415 | if (needs_parentheses (&R_temp_kk)) | ||
1416 | sb_printf1 (R_cur_r, "(%.*s)*", 3, &R_temp_kk); | ||
1417 | else | ||
1418 | sb_printf1 (R_cur_r, "%.*s*", 1, &R_temp_kk); | ||
1419 | } | ||
1420 | /* aa*a = a+a */ | ||
1421 | else if ((0 == clean_ik_kk_cmp) && (0 == clean_kk_kj_cmp) && | ||
1422 | (! has_epsilon (R_last_ik))) | ||
1423 | { | ||
1424 | if (needs_parentheses (&R_temp_kk)) | ||
1425 | sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, &R_temp_kk); | ||
1426 | else | ||
1427 | sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, &R_temp_kk); | ||
1428 | } | ||
1429 | /* | ||
1430 | * (e|a)a*a = a+ | ||
1431 | * aa*(e|a) = a+ | ||
1432 | * a(e|a)*(e|a) = a+ | ||
1433 | * (e|a)a*a = a+ | ||
1434 | */else | ||
1435 | { | ||
1436 | eps_check = (has_epsilon (R_last_ik) + has_epsilon (R_last_kk) | ||
1437 | + has_epsilon (R_last_kj)); | ||
1438 | |||
1439 | if (1 == eps_check) | ||
1440 | { | ||
1441 | if (needs_parentheses (&R_temp_kk)) | ||
1442 | sb_printf1 (R_cur_r, "(%.*s)+", 3, &R_temp_kk); | ||
1443 | else | ||
1444 | sb_printf1 (R_cur_r, "%.*s+", 1, &R_temp_kk); | ||
1445 | } | ||
1446 | } | ||
1447 | } | ||
1448 | /* | ||
1449 | * aa*b = a+b | ||
1450 | * (e|a)(e|a)*b = a*b | ||
1451 | */ | ||
1452 | else if (0 == sb_strcmp (&R_temp_ik, &R_temp_kk)) | ||
1453 | { | ||
1454 | if (has_epsilon (R_last_ik)) | ||
1455 | { | ||
1456 | if (needs_parentheses (&R_temp_kk)) | ||
1457 | sb_printf2 (R_cur_r, "(%.*s)*%.*s", 3, &R_temp_kk, R_last_kj); | ||
1458 | else | ||
1459 | sb_printf2 (R_cur_r, "%.*s*%.*s", 1, &R_temp_kk, R_last_kj); | ||
1460 | } | ||
1461 | else | ||
1462 | { | ||
1463 | if (needs_parentheses (&R_temp_kk)) | ||
1464 | sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, &R_temp_kk, R_last_kj); | ||
1465 | else | ||
1466 | sb_printf2 (R_cur_r, "%.*s+%.*s", 1, &R_temp_kk, R_last_kj); | ||
1467 | } | ||
1468 | } | ||
1469 | /* | ||
1470 | * ba*a = ba+ | ||
1471 | * b(e|a)*(e|a) = ba* | ||
1472 | */ | ||
1473 | else if (0 == sb_strcmp (&R_temp_kk, &R_temp_kj)) | ||
1474 | { | ||
1475 | if (has_epsilon (R_last_kj)) | ||
1476 | { | ||
1477 | if (needs_parentheses (&R_temp_kk)) | ||
1478 | sb_printf2 (R_cur_r, "%.*s(%.*s)*", 3, R_last_ik, &R_temp_kk); | ||
1479 | else | ||
1480 | sb_printf2 (R_cur_r, "%.*s%.*s*", 1, R_last_ik, &R_temp_kk); | ||
1481 | } | ||
1482 | else | ||
1483 | { | ||
1484 | if (needs_parentheses (&R_temp_kk)) | ||
1485 | sb_printf2 (R_cur_r, "(%.*s)+%.*s", 3, R_last_ik, &R_temp_kk); | ||
1486 | else | ||
1487 | sb_printf2 (R_cur_r, "%.*s+%.*s", 1, R_last_ik, &R_temp_kk); | ||
1488 | } | ||
1489 | } | ||
1490 | else | ||
1491 | { | ||
1492 | if (0 < R_temp_kk.slen) | ||
1493 | { | ||
1494 | if (needs_parentheses (&R_temp_kk)) | ||
1495 | { | ||
1496 | sb_printf3 (R_cur_r, | ||
1497 | "%.*s(%.*s)*%.*s", | ||
1498 | 3, | ||
1499 | R_last_ik, | ||
1500 | &R_temp_kk, | ||
1501 | R_last_kj); | ||
1502 | } | ||
1503 | else | ||
1504 | { | ||
1505 | sb_printf3 (R_cur_r, | ||
1506 | "%.*s%.*s*%.*s", | ||
1507 | 1, | ||
1508 | R_last_ik, | ||
1509 | &R_temp_kk, | ||
1510 | R_last_kj); | ||
1511 | } | ||
1512 | } | ||
1513 | else | ||
1514 | { | ||
1515 | sb_printf2 (R_cur_r, "%.*s%.*s", 0, R_last_ik, R_last_kj); | ||
1516 | } | ||
1517 | } | ||
1518 | } | ||
1519 | sb_free (&R_temp_ij); | ||
1520 | sb_free (&R_temp_ik); | ||
1521 | sb_free (&R_temp_kk); | ||
1522 | sb_free (&R_temp_kj); | ||
1523 | |||
1524 | if ((GNUNET_YES == R_cur_l->null_flag) && (GNUNET_YES == R_cur_r->null_flag)) | ||
1525 | { | ||
1526 | R_cur_ij->null_flag = GNUNET_YES; | ||
1527 | return; | ||
1528 | } | ||
1529 | |||
1530 | if ((GNUNET_YES != R_cur_l->null_flag) && (GNUNET_YES == R_cur_r->null_flag)) | ||
1531 | { | ||
1532 | struct StringBuffer tmp; | ||
1533 | |||
1534 | tmp = *R_cur_ij; | ||
1535 | *R_cur_ij = *R_cur_l; | ||
1536 | *R_cur_l = tmp; | ||
1537 | return; | ||
1538 | } | ||
1539 | |||
1540 | if ((GNUNET_YES == R_cur_l->null_flag) && (GNUNET_YES != R_cur_r->null_flag)) | ||
1541 | { | ||
1542 | struct StringBuffer tmp; | ||
1543 | |||
1544 | tmp = *R_cur_ij; | ||
1545 | *R_cur_ij = *R_cur_r; | ||
1546 | *R_cur_r = tmp; | ||
1547 | return; | ||
1548 | } | ||
1549 | |||
1550 | if (0 == sb_nullstrcmp (R_cur_l, R_cur_r)) | ||
1551 | { | ||
1552 | struct StringBuffer tmp; | ||
1553 | |||
1554 | tmp = *R_cur_ij; | ||
1555 | *R_cur_ij = *R_cur_l; | ||
1556 | *R_cur_l = tmp; | ||
1557 | return; | ||
1558 | } | ||
1559 | sb_printf2 (R_cur_ij, "(%.*s|%.*s)", 3, R_cur_l, R_cur_r); | ||
1560 | } | ||
1561 | |||
1562 | |||
1563 | /** | ||
1564 | * Create proofs for all states in the given automaton. Implementation of the | ||
1565 | * algorithm described in chapter 3.2.1 of "Automata Theory, Languages, and | ||
1566 | * Computation 3rd Edition" by Hopcroft, Motwani and Ullman. | ||
1567 | * | ||
1568 | * Each state in the automaton gets assigned 'proof' and 'hash' (hash of the | ||
1569 | * proof) fields. The starting state will only have a valid proof/hash if it has | ||
1570 | * any incoming transitions. | ||
1571 | * | ||
1572 | * @param a automaton for which to assign proofs and hashes, must not be NULL | ||
1573 | */ | ||
1574 | static int | ||
1575 | automaton_create_proofs (struct REGEX_INTERNAL_Automaton *a) | ||
1576 | { | ||
1577 | unsigned int n = a->state_count; | ||
1578 | struct REGEX_INTERNAL_State *states[n]; | ||
1579 | struct StringBuffer *R_last; | ||
1580 | struct StringBuffer *R_cur; | ||
1581 | struct StringBuffer R_cur_r; | ||
1582 | struct StringBuffer R_cur_l; | ||
1583 | struct StringBuffer *R_swap; | ||
1584 | struct REGEX_INTERNAL_Transition *t; | ||
1585 | struct StringBuffer complete_regex; | ||
1586 | unsigned int i; | ||
1587 | unsigned int j; | ||
1588 | unsigned int k; | ||
1589 | |||
1590 | R_last = GNUNET_malloc_large (sizeof(struct StringBuffer) * n * n); | ||
1591 | R_cur = GNUNET_malloc_large (sizeof(struct StringBuffer) * n * n); | ||
1592 | if ((NULL == R_last) || (NULL == R_cur)) | ||
1593 | { | ||
1594 | GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc"); | ||
1595 | GNUNET_free (R_cur); | ||
1596 | GNUNET_free (R_last); | ||
1597 | return GNUNET_SYSERR; | ||
1598 | } | ||
1599 | |||
1600 | /* create depth-first numbering of the states, initializes 'state' */ | ||
1601 | REGEX_INTERNAL_automaton_traverse (a, | ||
1602 | a->start, | ||
1603 | NULL, | ||
1604 | NULL, | ||
1605 | &number_states, | ||
1606 | states); | ||
1607 | |||
1608 | for (i = 0; i < n; i++) | ||
1609 | GNUNET_assert (NULL != states[i]); | ||
1610 | for (i = 0; i < n; i++) | ||
1611 | for (j = 0; j < n; j++) | ||
1612 | R_last[i * n + j].null_flag = GNUNET_YES; | ||
1613 | |||
1614 | /* Compute regular expressions of length "1" between each pair of states */ | ||
1615 | for (i = 0; i < n; i++) | ||
1616 | { | ||
1617 | for (t = states[i]->transitions_head; NULL != t; t = t->next) | ||
1618 | { | ||
1619 | j = t->to_state->dfs_id; | ||
1620 | if (GNUNET_YES == R_last[i * n + j].null_flag) | ||
1621 | { | ||
1622 | sb_strdup_cstr (&R_last[i * n + j], t->label); | ||
1623 | } | ||
1624 | else | ||
1625 | { | ||
1626 | sb_append_cstr (&R_last[i * n + j], "|"); | ||
1627 | sb_append_cstr (&R_last[i * n + j], t->label); | ||
1628 | } | ||
1629 | } | ||
1630 | /* add self-loop: i is reachable from i via epsilon-transition */ | ||
1631 | if (GNUNET_YES == R_last[i * n + i].null_flag) | ||
1632 | { | ||
1633 | R_last[i * n + i].slen = 0; | ||
1634 | R_last[i * n + i].null_flag = GNUNET_NO; | ||
1635 | } | ||
1636 | else | ||
1637 | { | ||
1638 | sb_wrap (&R_last[i * n + i], "(|%.*s)", 3); | ||
1639 | } | ||
1640 | } | ||
1641 | for (i = 0; i < n; i++) | ||
1642 | for (j = 0; j < n; j++) | ||
1643 | if (needs_parentheses (&R_last[i * n + j])) | ||
1644 | sb_wrap (&R_last[i * n + j], "(%.*s)", 2); | ||
1645 | /* Compute regular expressions of length "k" between each pair of states per | ||
1646 | * induction */ | ||
1647 | memset (&R_cur_l, 0, sizeof(struct StringBuffer)); | ||
1648 | memset (&R_cur_r, 0, sizeof(struct StringBuffer)); | ||
1649 | for (k = 0; k < n; k++) | ||
1650 | { | ||
1651 | for (i = 0; i < n; i++) | ||
1652 | { | ||
1653 | for (j = 0; j < n; j++) | ||
1654 | { | ||
1655 | /* Basis for the recursion: | ||
1656 | * $R^{(k)}_{ij} = R^{(k-1)}_{ij} | R^{(k-1)}_{ik} ( R^{(k-1)}_{kk} )^* R^{(k-1)}_{kj} | ||
1657 | * R_last == R^{(k-1)}, R_cur == R^{(k)} | ||
1658 | */ | ||
1659 | |||
1660 | /* Create R_cur[i][j] and simplify the expression */ | ||
1661 | automaton_create_proofs_simplify (&R_last[i * n + j], | ||
1662 | &R_last[i * n + k], | ||
1663 | &R_last[k * n + k], | ||
1664 | &R_last[k * n + j], | ||
1665 | &R_cur[i * n + j], | ||
1666 | &R_cur_l, | ||
1667 | &R_cur_r); | ||
1668 | } | ||
1669 | } | ||
1670 | /* set R_last = R_cur */ | ||
1671 | R_swap = R_last; | ||
1672 | R_last = R_cur; | ||
1673 | R_cur = R_swap; | ||
1674 | /* clear 'R_cur' for next iteration */ | ||
1675 | for (i = 0; i < n; i++) | ||
1676 | for (j = 0; j < n; j++) | ||
1677 | R_cur[i * n + j].null_flag = GNUNET_YES; | ||
1678 | } | ||
1679 | sb_free (&R_cur_l); | ||
1680 | sb_free (&R_cur_r); | ||
1681 | /* assign proofs and hashes */ | ||
1682 | for (i = 0; i < n; i++) | ||
1683 | { | ||
1684 | if (GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag) | ||
1685 | { | ||
1686 | states[i]->proof = GNUNET_strndup (R_last[a->start->dfs_id * n + i].sbuf, | ||
1687 | R_last[a->start->dfs_id * n + i].slen); | ||
1688 | GNUNET_CRYPTO_hash (states[i]->proof, | ||
1689 | strlen (states[i]->proof), | ||
1690 | &states[i]->hash); | ||
1691 | } | ||
1692 | } | ||
1693 | |||
1694 | /* complete regex for whole DFA: union of all pairs (start state/accepting | ||
1695 | * state(s)). */ | ||
1696 | sb_init (&complete_regex, 16 * n); | ||
1697 | for (i = 0; i < n; i++) | ||
1698 | { | ||
1699 | if (states[i]->accepting) | ||
1700 | { | ||
1701 | if ((0 == complete_regex.slen) && | ||
1702 | (0 < R_last[a->start->dfs_id * n + i].slen)) | ||
1703 | { | ||
1704 | sb_append (&complete_regex, &R_last[a->start->dfs_id * n + i]); | ||
1705 | } | ||
1706 | else if ((GNUNET_YES != R_last[a->start->dfs_id * n + i].null_flag) && | ||
1707 | (0 < R_last[a->start->dfs_id * n + i].slen)) | ||
1708 | { | ||
1709 | sb_append_cstr (&complete_regex, "|"); | ||
1710 | sb_append (&complete_regex, &R_last[a->start->dfs_id * n + i]); | ||
1711 | } | ||
1712 | } | ||
1713 | } | ||
1714 | a->canonical_regex = | ||
1715 | GNUNET_strndup (complete_regex.sbuf, complete_regex.slen); | ||
1716 | |||
1717 | /* cleanup */ | ||
1718 | sb_free (&complete_regex); | ||
1719 | for (i = 0; i < n; i++) | ||
1720 | for (j = 0; j < n; j++) | ||
1721 | { | ||
1722 | sb_free (&R_cur[i * n + j]); | ||
1723 | sb_free (&R_last[i * n + j]); | ||
1724 | } | ||
1725 | GNUNET_free (R_cur); | ||
1726 | GNUNET_free (R_last); | ||
1727 | return GNUNET_OK; | ||
1728 | } | ||
1729 | |||
1730 | |||
1731 | /** | ||
1732 | * Creates a new DFA state based on a set of NFA states. Needs to be freed using | ||
1733 | * automaton_destroy_state. | ||
1734 | * | ||
1735 | * @param ctx context | ||
1736 | * @param nfa_states set of NFA states on which the DFA should be based on | ||
1737 | * | ||
1738 | * @return new DFA state | ||
1739 | */ | ||
1740 | static struct REGEX_INTERNAL_State * | ||
1741 | dfa_state_create (struct REGEX_INTERNAL_Context *ctx, | ||
1742 | struct REGEX_INTERNAL_StateSet *nfa_states) | ||
1743 | { | ||
1744 | struct REGEX_INTERNAL_State *s; | ||
1745 | char *pos; | ||
1746 | size_t len; | ||
1747 | struct REGEX_INTERNAL_State *cstate; | ||
1748 | struct REGEX_INTERNAL_Transition *ctran; | ||
1749 | unsigned int i; | ||
1750 | |||
1751 | s = GNUNET_new (struct REGEX_INTERNAL_State); | ||
1752 | s->id = ctx->state_id++; | ||
1753 | s->index = -1; | ||
1754 | s->lowlink = -1; | ||
1755 | |||
1756 | if (NULL == nfa_states) | ||
1757 | { | ||
1758 | GNUNET_asprintf (&s->name, "s%i", s->id); | ||
1759 | return s; | ||
1760 | } | ||
1761 | |||
1762 | s->nfa_set = *nfa_states; | ||
1763 | |||
1764 | if (nfa_states->off < 1) | ||
1765 | return s; | ||
1766 | |||
1767 | /* Create a name based on 'nfa_states' */ | ||
1768 | len = nfa_states->off * 14 + 4; | ||
1769 | s->name = GNUNET_malloc (len); | ||
1770 | strcat (s->name, "{"); | ||
1771 | pos = s->name + 1; | ||
1772 | |||
1773 | for (i = 0; i < nfa_states->off; i++) | ||
1774 | { | ||
1775 | cstate = nfa_states->states[i]; | ||
1776 | GNUNET_snprintf (pos, pos - s->name + len, "%i,", cstate->id); | ||
1777 | pos += strlen (pos); | ||
1778 | |||
1779 | /* Add a transition for each distinct label to NULL state */ | ||
1780 | for (ctran = cstate->transitions_head; NULL != ctran; ctran = ctran->next) | ||
1781 | if (NULL != ctran->label) | ||
1782 | state_add_transition (ctx, s, ctran->label, NULL); | ||
1783 | |||
1784 | /* If the nfa_states contain an accepting state, the new dfa state is also | ||
1785 | * accepting. */ | ||
1786 | if (cstate->accepting) | ||
1787 | s->accepting = 1; | ||
1788 | } | ||
1789 | pos[-1] = '}'; | ||
1790 | s->name = GNUNET_realloc (s->name, strlen (s->name) + 1); | ||
1791 | |||
1792 | memset (nfa_states, 0, sizeof(struct REGEX_INTERNAL_StateSet)); | ||
1793 | return s; | ||
1794 | } | ||
1795 | |||
1796 | |||
1797 | /** | ||
1798 | * Move from the given state 's' to the next state on transition 'str'. Consumes | ||
1799 | * as much of the given 'str' as possible (useful for strided DFAs). On return | ||
1800 | * 's' will point to the next state, and the length of the substring used for | ||
1801 | * this transition will be returned. If no transition possible 0 is returned and | ||
1802 | * 's' points to NULL. | ||
1803 | * | ||
1804 | * @param s starting state, will point to the next state or NULL (if no | ||
1805 | * transition possible) | ||
1806 | * @param str edge label to follow (will match longest common prefix) | ||
1807 | * | ||
1808 | * @return length of the substring consumed from 'str' | ||
1809 | */ | ||
1810 | static unsigned int | ||
1811 | dfa_move (struct REGEX_INTERNAL_State **s, const char *str) | ||
1812 | { | ||
1813 | struct REGEX_INTERNAL_Transition *t; | ||
1814 | struct REGEX_INTERNAL_State *new_s; | ||
1815 | unsigned int len; | ||
1816 | unsigned int max_len; | ||
1817 | |||
1818 | if (NULL == s) | ||
1819 | return 0; | ||
1820 | |||
1821 | new_s = NULL; | ||
1822 | max_len = 0; | ||
1823 | for (t = (*s)->transitions_head; NULL != t; t = t->next) | ||
1824 | { | ||
1825 | len = strlen (t->label); | ||
1826 | |||
1827 | if (0 == strncmp (t->label, str, len)) | ||
1828 | { | ||
1829 | if (len >= max_len) | ||
1830 | { | ||
1831 | max_len = len; | ||
1832 | new_s = t->to_state; | ||
1833 | } | ||
1834 | } | ||
1835 | } | ||
1836 | |||
1837 | *s = new_s; | ||
1838 | return max_len; | ||
1839 | } | ||
1840 | |||
1841 | |||
1842 | /** | ||
1843 | * Set the given state 'marked' to #GNUNET_YES. Used by the | ||
1844 | * #dfa_remove_unreachable_states() function to detect unreachable states in the | ||
1845 | * automaton. | ||
1846 | * | ||
1847 | * @param cls closure, not used. | ||
1848 | * @param count count, not used. | ||
1849 | * @param s state where the marked attribute will be set to #GNUNET_YES. | ||
1850 | */ | ||
1851 | static void | ||
1852 | mark_states (void *cls, | ||
1853 | const unsigned int count, | ||
1854 | struct REGEX_INTERNAL_State *s) | ||
1855 | { | ||
1856 | s->marked = GNUNET_YES; | ||
1857 | } | ||
1858 | |||
1859 | |||
1860 | /** | ||
1861 | * Remove all unreachable states from DFA 'a'. Unreachable states are those | ||
1862 | * states that are not reachable from the starting state. | ||
1863 | * | ||
1864 | * @param a DFA automaton | ||
1865 | */ | ||
1866 | static void | ||
1867 | dfa_remove_unreachable_states (struct REGEX_INTERNAL_Automaton *a) | ||
1868 | { | ||
1869 | struct REGEX_INTERNAL_State *s; | ||
1870 | struct REGEX_INTERNAL_State *s_next; | ||
1871 | |||
1872 | /* 1. unmark all states */ | ||
1873 | for (s = a->states_head; NULL != s; s = s->next) | ||
1874 | s->marked = GNUNET_NO; | ||
1875 | |||
1876 | /* 2. traverse dfa from start state and mark all visited states */ | ||
1877 | REGEX_INTERNAL_automaton_traverse (a, | ||
1878 | a->start, | ||
1879 | NULL, | ||
1880 | NULL, | ||
1881 | &mark_states, | ||
1882 | NULL); | ||
1883 | |||
1884 | /* 3. delete all states that were not visited */ | ||
1885 | for (s = a->states_head; NULL != s; s = s_next) | ||
1886 | { | ||
1887 | s_next = s->next; | ||
1888 | if (GNUNET_NO == s->marked) | ||
1889 | automaton_remove_state (a, s); | ||
1890 | } | ||
1891 | } | ||
1892 | |||
1893 | |||
1894 | /** | ||
1895 | * Remove all dead states from the DFA 'a'. Dead states are those states that do | ||
1896 | * not transition to any other state but themselves. | ||
1897 | * | ||
1898 | * @param a DFA automaton | ||
1899 | */ | ||
1900 | static void | ||
1901 | dfa_remove_dead_states (struct REGEX_INTERNAL_Automaton *a) | ||
1902 | { | ||
1903 | struct REGEX_INTERNAL_State *s; | ||
1904 | struct REGEX_INTERNAL_State *s_next; | ||
1905 | struct REGEX_INTERNAL_Transition *t; | ||
1906 | int dead; | ||
1907 | |||
1908 | GNUNET_assert (DFA == a->type); | ||
1909 | |||
1910 | for (s = a->states_head; NULL != s; s = s_next) | ||
1911 | { | ||
1912 | s_next = s->next; | ||
1913 | |||
1914 | if (s->accepting) | ||
1915 | continue; | ||
1916 | |||
1917 | dead = 1; | ||
1918 | for (t = s->transitions_head; NULL != t; t = t->next) | ||
1919 | { | ||
1920 | if ((NULL != t->to_state) && (t->to_state != s) ) | ||
1921 | { | ||
1922 | dead = 0; | ||
1923 | break; | ||
1924 | } | ||
1925 | } | ||
1926 | |||
1927 | if (0 == dead) | ||
1928 | continue; | ||
1929 | |||
1930 | /* state s is dead, remove it */ | ||
1931 | automaton_remove_state (a, s); | ||
1932 | } | ||
1933 | } | ||
1934 | |||
1935 | |||
1936 | /** | ||
1937 | * Merge all non distinguishable states in the DFA 'a' | ||
1938 | * | ||
1939 | * @param ctx context | ||
1940 | * @param a DFA automaton | ||
1941 | * @return #GNUNET_OK on success | ||
1942 | */ | ||
1943 | static int | ||
1944 | dfa_merge_nondistinguishable_states (struct REGEX_INTERNAL_Context *ctx, | ||
1945 | struct REGEX_INTERNAL_Automaton *a) | ||
1946 | { | ||
1947 | uint32_t *table; | ||
1948 | struct REGEX_INTERNAL_State *s1; | ||
1949 | struct REGEX_INTERNAL_State *s2; | ||
1950 | struct REGEX_INTERNAL_Transition *t1; | ||
1951 | struct REGEX_INTERNAL_Transition *t2; | ||
1952 | struct REGEX_INTERNAL_State *s1_next; | ||
1953 | struct REGEX_INTERNAL_State *s2_next; | ||
1954 | int change; | ||
1955 | unsigned int num_equal_edges; | ||
1956 | unsigned int i; | ||
1957 | unsigned int state_cnt; | ||
1958 | unsigned long long idx; | ||
1959 | unsigned long long idx1; | ||
1960 | |||
1961 | if ((NULL == a) || (0 == a->state_count)) | ||
1962 | { | ||
1963 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, | ||
1964 | "Could not merge nondistinguishable states, automaton was NULL.\n"); | ||
1965 | return GNUNET_SYSERR; | ||
1966 | } | ||
1967 | |||
1968 | state_cnt = a->state_count; | ||
1969 | table = GNUNET_malloc_large ( | ||
1970 | (sizeof(uint32_t) * state_cnt * state_cnt / 32) + sizeof(uint32_t)); | ||
1971 | if (NULL == table) | ||
1972 | { | ||
1973 | GNUNET_log_strerror (GNUNET_ERROR_TYPE_ERROR, "malloc"); | ||
1974 | return GNUNET_SYSERR; | ||
1975 | } | ||
1976 | |||
1977 | for (i = 0, s1 = a->states_head; NULL != s1; s1 = s1->next) | ||
1978 | s1->marked = i++; | ||
1979 | |||
1980 | /* Mark all pairs of accepting/!accepting states */ | ||
1981 | for (s1 = a->states_head; NULL != s1; s1 = s1->next) | ||
1982 | for (s2 = a->states_head; NULL != s2; s2 = s2->next) | ||
1983 | if ((s1->accepting && ! s2->accepting) || | ||
1984 | (! s1->accepting && s2->accepting)) | ||
1985 | { | ||
1986 | idx = (unsigned long long) s1->marked * state_cnt + s2->marked; | ||
1987 | table[idx / 32] |= (1U << (idx % 32)); | ||
1988 | } | ||
1989 | |||
1990 | /* Find all equal states */ | ||
1991 | change = 1; | ||
1992 | while (0 != change) | ||
1993 | { | ||
1994 | change = 0; | ||
1995 | for (s1 = a->states_head; NULL != s1; s1 = s1->next) | ||
1996 | { | ||
1997 | for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2->next) | ||
1998 | { | ||
1999 | idx = (unsigned long long) s1->marked * state_cnt + s2->marked; | ||
2000 | if (0 != (table[idx / 32] & (1U << (idx % 32)))) | ||
2001 | continue; | ||
2002 | num_equal_edges = 0; | ||
2003 | for (t1 = s1->transitions_head; NULL != t1; t1 = t1->next) | ||
2004 | { | ||
2005 | for (t2 = s2->transitions_head; NULL != t2; t2 = t2->next) | ||
2006 | { | ||
2007 | if (0 == strcmp (t1->label, t2->label)) | ||
2008 | { | ||
2009 | num_equal_edges++; | ||
2010 | /* same edge, but targets definitively different, so we're different | ||
2011 | as well */ | ||
2012 | if (t1->to_state->marked > t2->to_state->marked) | ||
2013 | idx1 = (unsigned long long) t1->to_state->marked * state_cnt | ||
2014 | + t2->to_state->marked; | ||
2015 | else | ||
2016 | idx1 = (unsigned long long) t2->to_state->marked * state_cnt | ||
2017 | + t1->to_state->marked; | ||
2018 | if (0 != (table[idx1 / 32] & (1U << (idx1 % 32)))) | ||
2019 | { | ||
2020 | table[idx / 32] |= (1U << (idx % 32)); | ||
2021 | change = 1; /* changed a marker, need to run again */ | ||
2022 | } | ||
2023 | } | ||
2024 | } | ||
2025 | } | ||
2026 | if ((num_equal_edges != s1->transition_count) || | ||
2027 | (num_equal_edges != s2->transition_count)) | ||
2028 | { | ||
2029 | /* Make sure ALL edges of possible equal states are the same */ | ||
2030 | table[idx / 32] |= (1U << (idx % 32)); | ||
2031 | change = 1; /* changed a marker, need to run again */ | ||
2032 | } | ||
2033 | } | ||
2034 | } | ||
2035 | } | ||
2036 | |||
2037 | /* Merge states that are equal */ | ||
2038 | for (s1 = a->states_head; NULL != s1; s1 = s1_next) | ||
2039 | { | ||
2040 | s1_next = s1->next; | ||
2041 | for (s2 = a->states_head; NULL != s2 && s1 != s2; s2 = s2_next) | ||
2042 | { | ||
2043 | s2_next = s2->next; | ||
2044 | idx = (unsigned long long) s1->marked * state_cnt + s2->marked; | ||
2045 | if (0 == (table[idx / 32] & (1U << (idx % 32)))) | ||
2046 | automaton_merge_states (ctx, a, s1, s2); | ||
2047 | } | ||
2048 | } | ||
2049 | |||
2050 | GNUNET_free (table); | ||
2051 | return GNUNET_OK; | ||
2052 | } | ||
2053 | |||
2054 | |||
2055 | /** | ||
2056 | * Minimize the given DFA 'a' by removing all unreachable states, removing all | ||
2057 | * dead states and merging all non distinguishable states | ||
2058 | * | ||
2059 | * @param ctx context | ||
2060 | * @param a DFA automaton | ||
2061 | * @return GNUNET_OK on success | ||
2062 | */ | ||
2063 | static int | ||
2064 | dfa_minimize (struct REGEX_INTERNAL_Context *ctx, | ||
2065 | struct REGEX_INTERNAL_Automaton *a) | ||
2066 | { | ||
2067 | if (NULL == a) | ||
2068 | return GNUNET_SYSERR; | ||
2069 | |||
2070 | GNUNET_assert (DFA == a->type); | ||
2071 | |||
2072 | /* 1. remove unreachable states */ | ||
2073 | dfa_remove_unreachable_states (a); | ||
2074 | |||
2075 | /* 2. remove dead states */ | ||
2076 | dfa_remove_dead_states (a); | ||
2077 | |||
2078 | /* 3. Merge nondistinguishable states */ | ||
2079 | if (GNUNET_OK != dfa_merge_nondistinguishable_states (ctx, a)) | ||
2080 | return GNUNET_SYSERR; | ||
2081 | return GNUNET_OK; | ||
2082 | } | ||
2083 | |||
2084 | |||
2085 | /** | ||
2086 | * Context for adding strided transitions to a DFA. | ||
2087 | */ | ||
2088 | struct REGEX_INTERNAL_Strided_Context | ||
2089 | { | ||
2090 | /** | ||
2091 | * Length of the strides. | ||
2092 | */ | ||
2093 | const unsigned int stride; | ||
2094 | |||
2095 | /** | ||
2096 | * Strided transitions DLL. New strided transitions will be stored in this DLL | ||
2097 | * and afterwards added to the DFA. | ||
2098 | */ | ||
2099 | struct REGEX_INTERNAL_Transition *transitions_head; | ||
2100 | |||
2101 | /** | ||
2102 | * Strided transitions DLL. | ||
2103 | */ | ||
2104 | struct REGEX_INTERNAL_Transition *transitions_tail; | ||
2105 | }; | ||
2106 | |||
2107 | |||
2108 | /** | ||
2109 | * Recursive helper function to add strides to a DFA. | ||
2110 | * | ||
2111 | * @param cls context, contains stride length and strided transitions DLL. | ||
2112 | * @param depth current depth of the depth-first traversal of the graph. | ||
2113 | * @param label current label, string that contains all labels on the path from | ||
2114 | * 'start' to 's'. | ||
2115 | * @param start start state for the depth-first traversal of the graph. | ||
2116 | * @param s current state in the depth-first traversal | ||
2117 | */ | ||
2118 | static void | ||
2119 | dfa_add_multi_strides_helper (void *cls, | ||
2120 | const unsigned int depth, | ||
2121 | char *label, | ||
2122 | struct REGEX_INTERNAL_State *start, | ||
2123 | struct REGEX_INTERNAL_State *s) | ||
2124 | { | ||
2125 | struct REGEX_INTERNAL_Strided_Context *ctx = cls; | ||
2126 | struct REGEX_INTERNAL_Transition *t; | ||
2127 | char *new_label; | ||
2128 | |||
2129 | if (depth == ctx->stride) | ||
2130 | { | ||
2131 | t = GNUNET_new (struct REGEX_INTERNAL_Transition); | ||
2132 | t->label = GNUNET_strdup (label); | ||
2133 | t->to_state = s; | ||
2134 | t->from_state = start; | ||
2135 | GNUNET_CONTAINER_DLL_insert (ctx->transitions_head, | ||
2136 | ctx->transitions_tail, | ||
2137 | t); | ||
2138 | } | ||
2139 | else | ||
2140 | { | ||
2141 | for (t = s->transitions_head; NULL != t; t = t->next) | ||
2142 | { | ||
2143 | /* Do not consider self-loops, because it end's up in too many | ||
2144 | * transitions */ | ||
2145 | if (t->to_state == t->from_state) | ||
2146 | continue; | ||
2147 | |||
2148 | if (NULL != label) | ||
2149 | { | ||
2150 | GNUNET_asprintf (&new_label, "%s%s", label, t->label); | ||
2151 | } | ||
2152 | else | ||
2153 | new_label = GNUNET_strdup (t->label); | ||
2154 | |||
2155 | dfa_add_multi_strides_helper (cls, | ||
2156 | (depth + 1), | ||
2157 | new_label, | ||
2158 | start, | ||
2159 | t->to_state); | ||
2160 | } | ||
2161 | } | ||
2162 | GNUNET_free (label); | ||
2163 | } | ||
2164 | |||
2165 | |||
2166 | /** | ||
2167 | * Function called for each state in the DFA. Starts a traversal of depth set in | ||
2168 | * context starting from state 's'. | ||
2169 | * | ||
2170 | * @param cls context. | ||
2171 | * @param count not used. | ||
2172 | * @param s current state. | ||
2173 | */ | ||
2174 | static void | ||
2175 | dfa_add_multi_strides (void *cls, | ||
2176 | const unsigned int count, | ||
2177 | struct REGEX_INTERNAL_State *s) | ||
2178 | { | ||
2179 | dfa_add_multi_strides_helper (cls, 0, NULL, s, s); | ||
2180 | } | ||
2181 | |||
2182 | |||
2183 | /** | ||
2184 | * Adds multi-strided transitions to the given 'dfa'. | ||
2185 | * | ||
2186 | * @param regex_ctx regex context needed to add transitions to the automaton. | ||
2187 | * @param dfa DFA to which the multi strided transitions should be added. | ||
2188 | * @param stride_len length of the strides. | ||
2189 | */ | ||
2190 | void | ||
2191 | REGEX_INTERNAL_dfa_add_multi_strides (struct REGEX_INTERNAL_Context *regex_ctx, | ||
2192 | struct REGEX_INTERNAL_Automaton *dfa, | ||
2193 | const unsigned int stride_len) | ||
2194 | { | ||
2195 | struct REGEX_INTERNAL_Strided_Context ctx = { stride_len, NULL, NULL }; | ||
2196 | struct REGEX_INTERNAL_Transition *t; | ||
2197 | struct REGEX_INTERNAL_Transition *t_next; | ||
2198 | |||
2199 | if ((1 > stride_len) || (GNUNET_YES == dfa->is_multistrided)) | ||
2200 | return; | ||
2201 | |||
2202 | /* Compute the new transitions of given stride_len */ | ||
2203 | REGEX_INTERNAL_automaton_traverse (dfa, | ||
2204 | dfa->start, | ||
2205 | NULL, | ||
2206 | NULL, | ||
2207 | &dfa_add_multi_strides, | ||
2208 | &ctx); | ||
2209 | |||
2210 | /* Add all the new transitions to the automaton. */ | ||
2211 | for (t = ctx.transitions_head; NULL != t; t = t_next) | ||
2212 | { | ||
2213 | t_next = t->next; | ||
2214 | state_add_transition (regex_ctx, t->from_state, t->label, t->to_state); | ||
2215 | GNUNET_CONTAINER_DLL_remove (ctx.transitions_head, ctx.transitions_tail, t); | ||
2216 | GNUNET_free (t->label); | ||
2217 | GNUNET_free (t); | ||
2218 | } | ||
2219 | |||
2220 | /* Mark this automaton as multistrided */ | ||
2221 | dfa->is_multistrided = GNUNET_YES; | ||
2222 | } | ||
2223 | |||
2224 | |||
2225 | /** | ||
2226 | * Recursive Helper function for DFA path compression. Does DFS on the DFA graph | ||
2227 | * and adds new transitions to the given transitions DLL and marks states that | ||
2228 | * should be removed by setting state->contained to GNUNET_YES. | ||
2229 | * | ||
2230 | * @param dfa DFA for which the paths should be compressed. | ||
2231 | * @param start starting state for linear path search. | ||
2232 | * @param cur current state in the recursive DFS. | ||
2233 | * @param label current label (string of traversed labels). | ||
2234 | * @param max_len maximal path compression length. | ||
2235 | * @param transitions_head transitions DLL. | ||
2236 | * @param transitions_tail transitions DLL. | ||
2237 | */ | ||
2238 | void | ||
2239 | dfa_compress_paths_helper (struct REGEX_INTERNAL_Automaton *dfa, | ||
2240 | struct REGEX_INTERNAL_State *start, | ||
2241 | struct REGEX_INTERNAL_State *cur, | ||
2242 | char *label, | ||
2243 | unsigned int max_len, | ||
2244 | struct REGEX_INTERNAL_Transition **transitions_head, | ||
2245 | struct REGEX_INTERNAL_Transition **transitions_tail) | ||
2246 | { | ||
2247 | struct REGEX_INTERNAL_Transition *t; | ||
2248 | char *new_label; | ||
2249 | |||
2250 | |||
2251 | if ((NULL != label) && | ||
2252 | (((cur->incoming_transition_count > 1) || (GNUNET_YES == | ||
2253 | cur->accepting) || | ||
2254 | (GNUNET_YES == cur->marked) ) || | ||
2255 | ((start != dfa->start) && (max_len > 0) && (max_len == strlen ( | ||
2256 | label))) || | ||
2257 | ((start == dfa->start) && (GNUNET_REGEX_INITIAL_BYTES == strlen ( | ||
2258 | label))))) | ||
2259 | { | ||
2260 | t = GNUNET_new (struct REGEX_INTERNAL_Transition); | ||
2261 | t->label = GNUNET_strdup (label); | ||
2262 | t->to_state = cur; | ||
2263 | t->from_state = start; | ||
2264 | GNUNET_CONTAINER_DLL_insert (*transitions_head, *transitions_tail, t); | ||
2265 | |||
2266 | if (GNUNET_NO == cur->marked) | ||
2267 | { | ||
2268 | dfa_compress_paths_helper (dfa, | ||
2269 | cur, | ||
2270 | cur, | ||
2271 | NULL, | ||
2272 | max_len, | ||
2273 | transitions_head, | ||
2274 | transitions_tail); | ||
2275 | } | ||
2276 | return; | ||
2277 | } | ||
2278 | else if (cur != start) | ||
2279 | cur->contained = GNUNET_YES; | ||
2280 | |||
2281 | if ((GNUNET_YES == cur->marked) && (cur != start)) | ||
2282 | return; | ||
2283 | |||
2284 | cur->marked = GNUNET_YES; | ||
2285 | |||
2286 | |||
2287 | for (t = cur->transitions_head; NULL != t; t = t->next) | ||
2288 | { | ||
2289 | if (NULL != label) | ||
2290 | GNUNET_asprintf (&new_label, "%s%s", label, t->label); | ||
2291 | else | ||
2292 | new_label = GNUNET_strdup (t->label); | ||
2293 | |||
2294 | if (t->to_state != cur) | ||
2295 | { | ||
2296 | dfa_compress_paths_helper (dfa, | ||
2297 | start, | ||
2298 | t->to_state, | ||
2299 | new_label, | ||
2300 | max_len, | ||
2301 | transitions_head, | ||
2302 | transitions_tail); | ||
2303 | } | ||
2304 | GNUNET_free (new_label); | ||
2305 | } | ||
2306 | } | ||
2307 | |||
2308 | |||
2309 | /** | ||
2310 | * Compress paths in the given 'dfa'. Linear paths like 0->1->2->3 will be | ||
2311 | * compressed to 0->3 by combining transitions. | ||
2312 | * | ||
2313 | * @param regex_ctx context for adding new transitions. | ||
2314 | * @param dfa DFA representation, will directly modify the given DFA. | ||
2315 | * @param max_len maximal length of the compressed paths. | ||
2316 | */ | ||
2317 | static void | ||
2318 | dfa_compress_paths (struct REGEX_INTERNAL_Context *regex_ctx, | ||
2319 | struct REGEX_INTERNAL_Automaton *dfa, | ||
2320 | unsigned int max_len) | ||
2321 | { | ||
2322 | struct REGEX_INTERNAL_State *s; | ||
2323 | struct REGEX_INTERNAL_State *s_next; | ||
2324 | struct REGEX_INTERNAL_Transition *t; | ||
2325 | struct REGEX_INTERNAL_Transition *t_next; | ||
2326 | struct REGEX_INTERNAL_Transition *transitions_head = NULL; | ||
2327 | struct REGEX_INTERNAL_Transition *transitions_tail = NULL; | ||
2328 | |||
2329 | if (NULL == dfa) | ||
2330 | return; | ||
2331 | |||
2332 | /* Count the incoming transitions on each state. */ | ||
2333 | for (s = dfa->states_head; NULL != s; s = s->next) | ||
2334 | { | ||
2335 | for (t = s->transitions_head; NULL != t; t = t->next) | ||
2336 | { | ||
2337 | if (NULL != t->to_state) | ||
2338 | t->to_state->incoming_transition_count++; | ||
2339 | } | ||
2340 | } | ||
2341 | |||
2342 | /* Unmark all states. */ | ||
2343 | for (s = dfa->states_head; NULL != s; s = s->next) | ||
2344 | { | ||
2345 | s->marked = GNUNET_NO; | ||
2346 | s->contained = GNUNET_NO; | ||
2347 | } | ||
2348 | |||
2349 | /* Add strides and mark states that can be deleted. */ | ||
2350 | dfa_compress_paths_helper (dfa, | ||
2351 | dfa->start, | ||
2352 | dfa->start, | ||
2353 | NULL, | ||
2354 | max_len, | ||
2355 | &transitions_head, | ||
2356 | &transitions_tail); | ||
2357 | |||
2358 | /* Add all the new transitions to the automaton. */ | ||
2359 | for (t = transitions_head; NULL != t; t = t_next) | ||
2360 | { | ||
2361 | t_next = t->next; | ||
2362 | state_add_transition (regex_ctx, t->from_state, t->label, t->to_state); | ||
2363 | GNUNET_CONTAINER_DLL_remove (transitions_head, transitions_tail, t); | ||
2364 | GNUNET_free (t->label); | ||
2365 | GNUNET_free (t); | ||
2366 | } | ||
2367 | |||
2368 | /* Remove marked states (including their incoming and outgoing transitions). */ | ||
2369 | for (s = dfa->states_head; NULL != s; s = s_next) | ||
2370 | { | ||
2371 | s_next = s->next; | ||
2372 | if (GNUNET_YES == s->contained) | ||
2373 | automaton_remove_state (dfa, s); | ||
2374 | } | ||
2375 | } | ||
2376 | |||
2377 | |||
2378 | /** | ||
2379 | * Creates a new NFA fragment. Needs to be cleared using | ||
2380 | * automaton_fragment_clear. | ||
2381 | * | ||
2382 | * @param start starting state | ||
2383 | * @param end end state | ||
2384 | * | ||
2385 | * @return new NFA fragment | ||
2386 | */ | ||
2387 | static struct REGEX_INTERNAL_Automaton * | ||
2388 | nfa_fragment_create (struct REGEX_INTERNAL_State *start, | ||
2389 | struct REGEX_INTERNAL_State *end) | ||
2390 | { | ||
2391 | struct REGEX_INTERNAL_Automaton *n; | ||
2392 | |||
2393 | n = GNUNET_new (struct REGEX_INTERNAL_Automaton); | ||
2394 | |||
2395 | n->type = NFA; | ||
2396 | n->start = NULL; | ||
2397 | n->end = NULL; | ||
2398 | n->state_count = 0; | ||
2399 | |||
2400 | if ((NULL == start) || (NULL == end)) | ||
2401 | return n; | ||
2402 | |||
2403 | automaton_add_state (n, end); | ||
2404 | automaton_add_state (n, start); | ||
2405 | |||
2406 | n->state_count = 2; | ||
2407 | |||
2408 | n->start = start; | ||
2409 | n->end = end; | ||
2410 | |||
2411 | return n; | ||
2412 | } | ||
2413 | |||
2414 | |||
2415 | /** | ||
2416 | * Adds a list of states to the given automaton 'n'. | ||
2417 | * | ||
2418 | * @param n automaton to which the states should be added | ||
2419 | * @param states_head head of the DLL of states | ||
2420 | * @param states_tail tail of the DLL of states | ||
2421 | */ | ||
2422 | static void | ||
2423 | nfa_add_states (struct REGEX_INTERNAL_Automaton *n, | ||
2424 | struct REGEX_INTERNAL_State *states_head, | ||
2425 | struct REGEX_INTERNAL_State *states_tail) | ||
2426 | { | ||
2427 | struct REGEX_INTERNAL_State *s; | ||
2428 | |||
2429 | if ((NULL == n) || (NULL == states_head)) | ||
2430 | { | ||
2431 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not add states\n"); | ||
2432 | return; | ||
2433 | } | ||
2434 | |||
2435 | if (NULL == n->states_head) | ||
2436 | { | ||
2437 | n->states_head = states_head; | ||
2438 | n->states_tail = states_tail; | ||
2439 | return; | ||
2440 | } | ||
2441 | |||
2442 | if (NULL != states_head) | ||
2443 | { | ||
2444 | n->states_tail->next = states_head; | ||
2445 | n->states_tail = states_tail; | ||
2446 | } | ||
2447 | |||
2448 | for (s = states_head; NULL != s; s = s->next) | ||
2449 | n->state_count++; | ||
2450 | } | ||
2451 | |||
2452 | |||
2453 | /** | ||
2454 | * Creates a new NFA state. Needs to be freed using automaton_destroy_state. | ||
2455 | * | ||
2456 | * @param ctx context | ||
2457 | * @param accepting is it an accepting state or not | ||
2458 | * | ||
2459 | * @return new NFA state | ||
2460 | */ | ||
2461 | static struct REGEX_INTERNAL_State * | ||
2462 | nfa_state_create (struct REGEX_INTERNAL_Context *ctx, int accepting) | ||
2463 | { | ||
2464 | struct REGEX_INTERNAL_State *s; | ||
2465 | |||
2466 | s = GNUNET_new (struct REGEX_INTERNAL_State); | ||
2467 | s->id = ctx->state_id++; | ||
2468 | s->accepting = accepting; | ||
2469 | s->marked = GNUNET_NO; | ||
2470 | s->contained = 0; | ||
2471 | s->index = -1; | ||
2472 | s->lowlink = -1; | ||
2473 | s->scc_id = 0; | ||
2474 | s->name = NULL; | ||
2475 | GNUNET_asprintf (&s->name, "s%i", s->id); | ||
2476 | |||
2477 | return s; | ||
2478 | } | ||
2479 | |||
2480 | |||
2481 | /** | ||
2482 | * Calculates the closure set for the given set of states. | ||
2483 | * | ||
2484 | * @param ret set to sorted nfa closure on 'label' (epsilon closure if 'label' is NULL) | ||
2485 | * @param nfa the NFA containing 's' | ||
2486 | * @param states list of states on which to base the closure on | ||
2487 | * @param label transitioning label for which to base the closure on, | ||
2488 | * pass NULL for epsilon transition | ||
2489 | */ | ||
2490 | static void | ||
2491 | nfa_closure_set_create (struct REGEX_INTERNAL_StateSet *ret, | ||
2492 | struct REGEX_INTERNAL_Automaton *nfa, | ||
2493 | struct REGEX_INTERNAL_StateSet *states, | ||
2494 | const char *label) | ||
2495 | { | ||
2496 | struct REGEX_INTERNAL_State *s; | ||
2497 | unsigned int i; | ||
2498 | struct REGEX_INTERNAL_StateSet_MDLL cls_stack; | ||
2499 | struct REGEX_INTERNAL_State *clsstate; | ||
2500 | struct REGEX_INTERNAL_State *currentstate; | ||
2501 | struct REGEX_INTERNAL_Transition *ctran; | ||
2502 | |||
2503 | memset (ret, 0, sizeof(struct REGEX_INTERNAL_StateSet)); | ||
2504 | if (NULL == states) | ||
2505 | return; | ||
2506 | |||
2507 | for (i = 0; i < states->off; i++) | ||
2508 | { | ||
2509 | s = states->states[i]; | ||
2510 | |||
2511 | /* Add start state to closure only for epsilon closure */ | ||
2512 | if (NULL == label) | ||
2513 | state_set_append (ret, s); | ||
2514 | |||
2515 | /* initialize work stack */ | ||
2516 | cls_stack.head = NULL; | ||
2517 | cls_stack.tail = NULL; | ||
2518 | GNUNET_CONTAINER_MDLL_insert (ST, cls_stack.head, cls_stack.tail, s); | ||
2519 | cls_stack.len = 1; | ||
2520 | |||
2521 | while (NULL != (currentstate = cls_stack.tail)) | ||
2522 | { | ||
2523 | GNUNET_CONTAINER_MDLL_remove (ST, | ||
2524 | cls_stack.head, | ||
2525 | cls_stack.tail, | ||
2526 | currentstate); | ||
2527 | cls_stack.len--; | ||
2528 | for (ctran = currentstate->transitions_head; NULL != ctran; | ||
2529 | ctran = ctran->next) | ||
2530 | { | ||
2531 | if (NULL == (clsstate = ctran->to_state)) | ||
2532 | continue; | ||
2533 | if (0 != clsstate->contained) | ||
2534 | continue; | ||
2535 | if (0 != nullstrcmp (label, ctran->label)) | ||
2536 | continue; | ||
2537 | state_set_append (ret, clsstate); | ||
2538 | GNUNET_CONTAINER_MDLL_insert_tail (ST, | ||
2539 | cls_stack.head, | ||
2540 | cls_stack.tail, | ||
2541 | clsstate); | ||
2542 | cls_stack.len++; | ||
2543 | clsstate->contained = 1; | ||
2544 | } | ||
2545 | } | ||
2546 | } | ||
2547 | for (i = 0; i < ret->off; i++) | ||
2548 | ret->states[i]->contained = 0; | ||
2549 | |||
2550 | if (ret->off > 1) | ||
2551 | qsort (ret->states, | ||
2552 | ret->off, | ||
2553 | sizeof(struct REGEX_INTERNAL_State *), | ||
2554 | &state_compare); | ||
2555 | } | ||
2556 | |||
2557 | |||
2558 | /** | ||
2559 | * Pops two NFA fragments (a, b) from the stack and concatenates them (ab) | ||
2560 | * | ||
2561 | * @param ctx context | ||
2562 | */ | ||
2563 | static void | ||
2564 | nfa_add_concatenation (struct REGEX_INTERNAL_Context *ctx) | ||
2565 | { | ||
2566 | struct REGEX_INTERNAL_Automaton *a; | ||
2567 | struct REGEX_INTERNAL_Automaton *b; | ||
2568 | struct REGEX_INTERNAL_Automaton *new_nfa; | ||
2569 | |||
2570 | b = ctx->stack_tail; | ||
2571 | GNUNET_assert (NULL != b); | ||
2572 | GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b); | ||
2573 | a = ctx->stack_tail; | ||
2574 | GNUNET_assert (NULL != a); | ||
2575 | GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a); | ||
2576 | |||
2577 | state_add_transition (ctx, a->end, NULL, b->start); | ||
2578 | a->end->accepting = 0; | ||
2579 | b->end->accepting = 1; | ||
2580 | |||
2581 | new_nfa = nfa_fragment_create (NULL, NULL); | ||
2582 | nfa_add_states (new_nfa, a->states_head, a->states_tail); | ||
2583 | nfa_add_states (new_nfa, b->states_head, b->states_tail); | ||
2584 | new_nfa->start = a->start; | ||
2585 | new_nfa->end = b->end; | ||
2586 | new_nfa->state_count += a->state_count + b->state_count; | ||
2587 | automaton_fragment_clear (a); | ||
2588 | automaton_fragment_clear (b); | ||
2589 | |||
2590 | GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa); | ||
2591 | } | ||
2592 | |||
2593 | |||
2594 | /** | ||
2595 | * Pops a NFA fragment from the stack (a) and adds a new fragment (a*) | ||
2596 | * | ||
2597 | * @param ctx context | ||
2598 | */ | ||
2599 | static void | ||
2600 | nfa_add_star_op (struct REGEX_INTERNAL_Context *ctx) | ||
2601 | { | ||
2602 | struct REGEX_INTERNAL_Automaton *a; | ||
2603 | struct REGEX_INTERNAL_Automaton *new_nfa; | ||
2604 | struct REGEX_INTERNAL_State *start; | ||
2605 | struct REGEX_INTERNAL_State *end; | ||
2606 | |||
2607 | a = ctx->stack_tail; | ||
2608 | |||
2609 | if (NULL == a) | ||
2610 | { | ||
2611 | GNUNET_log ( | ||
2612 | GNUNET_ERROR_TYPE_ERROR, | ||
2613 | "nfa_add_star_op failed, because there was no element on the stack"); | ||
2614 | return; | ||
2615 | } | ||
2616 | |||
2617 | GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a); | ||
2618 | |||
2619 | start = nfa_state_create (ctx, 0); | ||
2620 | end = nfa_state_create (ctx, 1); | ||
2621 | |||
2622 | state_add_transition (ctx, start, NULL, a->start); | ||
2623 | state_add_transition (ctx, start, NULL, end); | ||
2624 | state_add_transition (ctx, a->end, NULL, a->start); | ||
2625 | state_add_transition (ctx, a->end, NULL, end); | ||
2626 | |||
2627 | a->end->accepting = 0; | ||
2628 | end->accepting = 1; | ||
2629 | |||
2630 | new_nfa = nfa_fragment_create (start, end); | ||
2631 | nfa_add_states (new_nfa, a->states_head, a->states_tail); | ||
2632 | automaton_fragment_clear (a); | ||
2633 | |||
2634 | GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa); | ||
2635 | } | ||
2636 | |||
2637 | |||
2638 | /** | ||
2639 | * Pops an NFA fragment (a) from the stack and adds a new fragment (a+) | ||
2640 | * | ||
2641 | * @param ctx context | ||
2642 | */ | ||
2643 | static void | ||
2644 | nfa_add_plus_op (struct REGEX_INTERNAL_Context *ctx) | ||
2645 | { | ||
2646 | struct REGEX_INTERNAL_Automaton *a; | ||
2647 | |||
2648 | a = ctx->stack_tail; | ||
2649 | |||
2650 | if (NULL == a) | ||
2651 | { | ||
2652 | GNUNET_log ( | ||
2653 | GNUNET_ERROR_TYPE_ERROR, | ||
2654 | "nfa_add_plus_op failed, because there was no element on the stack"); | ||
2655 | return; | ||
2656 | } | ||
2657 | |||
2658 | GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a); | ||
2659 | |||
2660 | state_add_transition (ctx, a->end, NULL, a->start); | ||
2661 | |||
2662 | GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, a); | ||
2663 | } | ||
2664 | |||
2665 | |||
2666 | /** | ||
2667 | * Pops an NFA fragment (a) from the stack and adds a new fragment (a?) | ||
2668 | * | ||
2669 | * @param ctx context | ||
2670 | */ | ||
2671 | static void | ||
2672 | nfa_add_question_op (struct REGEX_INTERNAL_Context *ctx) | ||
2673 | { | ||
2674 | struct REGEX_INTERNAL_Automaton *a; | ||
2675 | struct REGEX_INTERNAL_Automaton *new_nfa; | ||
2676 | struct REGEX_INTERNAL_State *start; | ||
2677 | struct REGEX_INTERNAL_State *end; | ||
2678 | |||
2679 | a = ctx->stack_tail; | ||
2680 | if (NULL == a) | ||
2681 | { | ||
2682 | GNUNET_log ( | ||
2683 | GNUNET_ERROR_TYPE_ERROR, | ||
2684 | "nfa_add_question_op failed, because there was no element on the stack"); | ||
2685 | return; | ||
2686 | } | ||
2687 | |||
2688 | GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a); | ||
2689 | |||
2690 | start = nfa_state_create (ctx, 0); | ||
2691 | end = nfa_state_create (ctx, 1); | ||
2692 | |||
2693 | state_add_transition (ctx, start, NULL, a->start); | ||
2694 | state_add_transition (ctx, start, NULL, end); | ||
2695 | state_add_transition (ctx, a->end, NULL, end); | ||
2696 | |||
2697 | a->end->accepting = 0; | ||
2698 | |||
2699 | new_nfa = nfa_fragment_create (start, end); | ||
2700 | nfa_add_states (new_nfa, a->states_head, a->states_tail); | ||
2701 | GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa); | ||
2702 | automaton_fragment_clear (a); | ||
2703 | } | ||
2704 | |||
2705 | |||
2706 | /** | ||
2707 | * Pops two NFA fragments (a, b) from the stack and adds a new NFA fragment that | ||
2708 | * alternates between a and b (a|b) | ||
2709 | * | ||
2710 | * @param ctx context | ||
2711 | */ | ||
2712 | static void | ||
2713 | nfa_add_alternation (struct REGEX_INTERNAL_Context *ctx) | ||
2714 | { | ||
2715 | struct REGEX_INTERNAL_Automaton *a; | ||
2716 | struct REGEX_INTERNAL_Automaton *b; | ||
2717 | struct REGEX_INTERNAL_Automaton *new_nfa; | ||
2718 | struct REGEX_INTERNAL_State *start; | ||
2719 | struct REGEX_INTERNAL_State *end; | ||
2720 | |||
2721 | b = ctx->stack_tail; | ||
2722 | GNUNET_assert (NULL != b); | ||
2723 | GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, b); | ||
2724 | a = ctx->stack_tail; | ||
2725 | GNUNET_assert (NULL != a); | ||
2726 | GNUNET_CONTAINER_DLL_remove (ctx->stack_head, ctx->stack_tail, a); | ||
2727 | |||
2728 | start = nfa_state_create (ctx, 0); | ||
2729 | end = nfa_state_create (ctx, 1); | ||
2730 | state_add_transition (ctx, start, NULL, a->start); | ||
2731 | state_add_transition (ctx, start, NULL, b->start); | ||
2732 | |||
2733 | state_add_transition (ctx, a->end, NULL, end); | ||
2734 | state_add_transition (ctx, b->end, NULL, end); | ||
2735 | |||
2736 | a->end->accepting = 0; | ||
2737 | b->end->accepting = 0; | ||
2738 | end->accepting = 1; | ||
2739 | |||
2740 | new_nfa = nfa_fragment_create (start, end); | ||
2741 | nfa_add_states (new_nfa, a->states_head, a->states_tail); | ||
2742 | nfa_add_states (new_nfa, b->states_head, b->states_tail); | ||
2743 | automaton_fragment_clear (a); | ||
2744 | automaton_fragment_clear (b); | ||
2745 | |||
2746 | GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, new_nfa); | ||
2747 | } | ||
2748 | |||
2749 | |||
2750 | /** | ||
2751 | * Adds a new nfa fragment to the stack | ||
2752 | * | ||
2753 | * @param ctx context | ||
2754 | * @param label label for nfa transition | ||
2755 | */ | ||
2756 | static void | ||
2757 | nfa_add_label (struct REGEX_INTERNAL_Context *ctx, const char *label) | ||
2758 | { | ||
2759 | struct REGEX_INTERNAL_Automaton *n; | ||
2760 | struct REGEX_INTERNAL_State *start; | ||
2761 | struct REGEX_INTERNAL_State *end; | ||
2762 | |||
2763 | GNUNET_assert (NULL != ctx); | ||
2764 | |||
2765 | start = nfa_state_create (ctx, 0); | ||
2766 | end = nfa_state_create (ctx, 1); | ||
2767 | state_add_transition (ctx, start, label, end); | ||
2768 | n = nfa_fragment_create (start, end); | ||
2769 | GNUNET_assert (NULL != n); | ||
2770 | GNUNET_CONTAINER_DLL_insert_tail (ctx->stack_head, ctx->stack_tail, n); | ||
2771 | } | ||
2772 | |||
2773 | |||
2774 | /** | ||
2775 | * Initialize a new context | ||
2776 | * | ||
2777 | * @param ctx context | ||
2778 | */ | ||
2779 | static void | ||
2780 | REGEX_INTERNAL_context_init (struct REGEX_INTERNAL_Context *ctx) | ||
2781 | { | ||
2782 | if (NULL == ctx) | ||
2783 | { | ||
2784 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Context was NULL!"); | ||
2785 | return; | ||
2786 | } | ||
2787 | ctx->state_id = 0; | ||
2788 | ctx->transition_id = 0; | ||
2789 | ctx->stack_head = NULL; | ||
2790 | ctx->stack_tail = NULL; | ||
2791 | } | ||
2792 | |||
2793 | |||
2794 | /** | ||
2795 | * Construct an NFA by parsing the regex string of length 'len'. | ||
2796 | * | ||
2797 | * @param regex regular expression string | ||
2798 | * @param len length of the string | ||
2799 | * | ||
2800 | * @return NFA, needs to be freed using REGEX_INTERNAL_destroy_automaton | ||
2801 | */ | ||
2802 | struct REGEX_INTERNAL_Automaton * | ||
2803 | REGEX_INTERNAL_construct_nfa (const char *regex, const size_t len) | ||
2804 | { | ||
2805 | struct REGEX_INTERNAL_Context ctx; | ||
2806 | struct REGEX_INTERNAL_Automaton *nfa; | ||
2807 | const char *regexp; | ||
2808 | char curlabel[2]; | ||
2809 | char *error_msg; | ||
2810 | unsigned int count; | ||
2811 | unsigned int altcount; | ||
2812 | unsigned int atomcount; | ||
2813 | unsigned int poff; | ||
2814 | unsigned int psize; | ||
2815 | |||
2816 | struct | ||
2817 | { | ||
2818 | int altcount; | ||
2819 | int atomcount; | ||
2820 | } *p; | ||
2821 | |||
2822 | if ((NULL == regex) || (0 == strlen (regex)) || (0 == len)) | ||
2823 | { | ||
2824 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, | ||
2825 | "Could not parse regex. Empty regex string provided.\n"); | ||
2826 | |||
2827 | return NULL; | ||
2828 | } | ||
2829 | REGEX_INTERNAL_context_init (&ctx); | ||
2830 | |||
2831 | regexp = regex; | ||
2832 | curlabel[1] = '\0'; | ||
2833 | p = NULL; | ||
2834 | error_msg = NULL; | ||
2835 | altcount = 0; | ||
2836 | atomcount = 0; | ||
2837 | poff = 0; | ||
2838 | psize = 0; | ||
2839 | |||
2840 | for (count = 0; count < len && *regexp; count++, regexp++) | ||
2841 | { | ||
2842 | switch (*regexp) | ||
2843 | { | ||
2844 | case '(': | ||
2845 | if (atomcount > 1) | ||
2846 | { | ||
2847 | --atomcount; | ||
2848 | nfa_add_concatenation (&ctx); | ||
2849 | } | ||
2850 | if (poff == psize) | ||
2851 | GNUNET_array_grow (p, psize, psize * 2 + 4); /* FIXME why *2 +4? */ | ||
2852 | p[poff].altcount = altcount; | ||
2853 | p[poff].atomcount = atomcount; | ||
2854 | poff++; | ||
2855 | altcount = 0; | ||
2856 | atomcount = 0; | ||
2857 | break; | ||
2858 | |||
2859 | case '|': | ||
2860 | if (0 == atomcount) | ||
2861 | { | ||
2862 | error_msg = "Cannot append '|' to nothing"; | ||
2863 | goto error; | ||
2864 | } | ||
2865 | while (--atomcount > 0) | ||
2866 | nfa_add_concatenation (&ctx); | ||
2867 | altcount++; | ||
2868 | break; | ||
2869 | |||
2870 | case ')': | ||
2871 | if (0 == poff) | ||
2872 | { | ||
2873 | error_msg = "Missing opening '('"; | ||
2874 | goto error; | ||
2875 | } | ||
2876 | if (0 == atomcount) | ||
2877 | { | ||
2878 | /* Ignore this: "()" */ | ||
2879 | poff--; | ||
2880 | altcount = p[poff].altcount; | ||
2881 | atomcount = p[poff].atomcount; | ||
2882 | break; | ||
2883 | } | ||
2884 | while (--atomcount > 0) | ||
2885 | nfa_add_concatenation (&ctx); | ||
2886 | for (; altcount > 0; altcount--) | ||
2887 | nfa_add_alternation (&ctx); | ||
2888 | poff--; | ||
2889 | altcount = p[poff].altcount; | ||
2890 | atomcount = p[poff].atomcount; | ||
2891 | atomcount++; | ||
2892 | break; | ||
2893 | |||
2894 | case '*': | ||
2895 | if (atomcount == 0) | ||
2896 | { | ||
2897 | error_msg = "Cannot append '*' to nothing"; | ||
2898 | goto error; | ||
2899 | } | ||
2900 | nfa_add_star_op (&ctx); | ||
2901 | break; | ||
2902 | |||
2903 | case '+': | ||
2904 | if (atomcount == 0) | ||
2905 | { | ||
2906 | error_msg = "Cannot append '+' to nothing"; | ||
2907 | goto error; | ||
2908 | } | ||
2909 | nfa_add_plus_op (&ctx); | ||
2910 | break; | ||
2911 | |||
2912 | case '?': | ||
2913 | if (atomcount == 0) | ||
2914 | { | ||
2915 | error_msg = "Cannot append '?' to nothing"; | ||
2916 | goto error; | ||
2917 | } | ||
2918 | nfa_add_question_op (&ctx); | ||
2919 | break; | ||
2920 | |||
2921 | default: | ||
2922 | if (atomcount > 1) | ||
2923 | { | ||
2924 | --atomcount; | ||
2925 | nfa_add_concatenation (&ctx); | ||
2926 | } | ||
2927 | curlabel[0] = *regexp; | ||
2928 | nfa_add_label (&ctx, curlabel); | ||
2929 | atomcount++; | ||
2930 | break; | ||
2931 | } | ||
2932 | } | ||
2933 | if (0 != poff) | ||
2934 | { | ||
2935 | error_msg = "Unbalanced parenthesis"; | ||
2936 | goto error; | ||
2937 | } | ||
2938 | while (--atomcount > 0) | ||
2939 | nfa_add_concatenation (&ctx); | ||
2940 | for (; altcount > 0; altcount--) | ||
2941 | nfa_add_alternation (&ctx); | ||
2942 | |||
2943 | GNUNET_array_grow (p, psize, 0); | ||
2944 | |||
2945 | nfa = ctx.stack_tail; | ||
2946 | GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa); | ||
2947 | |||
2948 | if (NULL != ctx.stack_head) | ||
2949 | { | ||
2950 | error_msg = "Creating the NFA failed. NFA stack was not empty!"; | ||
2951 | goto error; | ||
2952 | } | ||
2953 | |||
2954 | /* Remember the regex that was used to generate this NFA */ | ||
2955 | nfa->regex = GNUNET_strdup (regex); | ||
2956 | |||
2957 | /* create depth-first numbering of the states for pretty printing */ | ||
2958 | REGEX_INTERNAL_automaton_traverse (nfa, | ||
2959 | NULL, | ||
2960 | NULL, | ||
2961 | NULL, | ||
2962 | &number_states, | ||
2963 | NULL); | ||
2964 | |||
2965 | /* No multistriding added so far */ | ||
2966 | nfa->is_multistrided = GNUNET_NO; | ||
2967 | |||
2968 | return nfa; | ||
2969 | |||
2970 | error: | ||
2971 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Could not parse regex: `%s'\n", regex); | ||
2972 | if (NULL != error_msg) | ||
2973 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "%s\n", error_msg); | ||
2974 | |||
2975 | GNUNET_free (p); | ||
2976 | |||
2977 | while (NULL != (nfa = ctx.stack_head)) | ||
2978 | { | ||
2979 | GNUNET_CONTAINER_DLL_remove (ctx.stack_head, ctx.stack_tail, nfa); | ||
2980 | REGEX_INTERNAL_automaton_destroy (nfa); | ||
2981 | } | ||
2982 | |||
2983 | return NULL; | ||
2984 | } | ||
2985 | |||
2986 | |||
2987 | /** | ||
2988 | * Create DFA states based on given 'nfa' and starting with 'dfa_state'. | ||
2989 | * | ||
2990 | * @param ctx context. | ||
2991 | * @param nfa NFA automaton. | ||
2992 | * @param dfa DFA automaton. | ||
2993 | * @param dfa_state current dfa state, pass epsilon closure of first nfa state | ||
2994 | * for starting. | ||
2995 | */ | ||
2996 | static void | ||
2997 | construct_dfa_states (struct REGEX_INTERNAL_Context *ctx, | ||
2998 | struct REGEX_INTERNAL_Automaton *nfa, | ||
2999 | struct REGEX_INTERNAL_Automaton *dfa, | ||
3000 | struct REGEX_INTERNAL_State *dfa_state) | ||
3001 | { | ||
3002 | struct REGEX_INTERNAL_Transition *ctran; | ||
3003 | struct REGEX_INTERNAL_State *new_dfa_state; | ||
3004 | struct REGEX_INTERNAL_State *state_contains; | ||
3005 | struct REGEX_INTERNAL_State *state_iter; | ||
3006 | struct REGEX_INTERNAL_StateSet tmp; | ||
3007 | struct REGEX_INTERNAL_StateSet nfa_set; | ||
3008 | |||
3009 | for (ctran = dfa_state->transitions_head; NULL != ctran; ctran = ctran->next) | ||
3010 | { | ||
3011 | if ((NULL == ctran->label) || (NULL != ctran->to_state) ) | ||
3012 | continue; | ||
3013 | |||
3014 | nfa_closure_set_create (&tmp, nfa, &dfa_state->nfa_set, ctran->label); | ||
3015 | nfa_closure_set_create (&nfa_set, nfa, &tmp, NULL); | ||
3016 | state_set_clear (&tmp); | ||
3017 | |||
3018 | state_contains = NULL; | ||
3019 | for (state_iter = dfa->states_head; NULL != state_iter; | ||
3020 | state_iter = state_iter->next) | ||
3021 | { | ||
3022 | if (0 == state_set_compare (&state_iter->nfa_set, &nfa_set)) | ||
3023 | { | ||
3024 | state_contains = state_iter; | ||
3025 | break; | ||
3026 | } | ||
3027 | } | ||
3028 | if (NULL == state_contains) | ||
3029 | { | ||
3030 | new_dfa_state = dfa_state_create (ctx, &nfa_set); | ||
3031 | automaton_add_state (dfa, new_dfa_state); | ||
3032 | ctran->to_state = new_dfa_state; | ||
3033 | construct_dfa_states (ctx, nfa, dfa, new_dfa_state); | ||
3034 | } | ||
3035 | else | ||
3036 | { | ||
3037 | ctran->to_state = state_contains; | ||
3038 | state_set_clear (&nfa_set); | ||
3039 | } | ||
3040 | } | ||
3041 | } | ||
3042 | |||
3043 | |||
3044 | /** | ||
3045 | * Construct DFA for the given 'regex' of length 'len'. | ||
3046 | * | ||
3047 | * Path compression means, that for example a DFA o -> a -> b -> c -> o will be | ||
3048 | * compressed to o -> abc -> o. Note that this parameter influences the | ||
3049 | * non-determinism of states of the resulting NFA in the DHT (number of outgoing | ||
3050 | * edges with the same label). For example for an application that stores IPv4 | ||
3051 | * addresses as bitstrings it could make sense to limit the path compression to | ||
3052 | * 4 or 8. | ||
3053 | * | ||
3054 | * @param regex regular expression string. | ||
3055 | * @param len length of the regular expression. | ||
3056 | * @param max_path_len limit the path compression length to the | ||
3057 | * given value. If set to 1, no path compression is applied. Set to 0 for | ||
3058 | * maximal possible path compression (generally not desirable). | ||
3059 | * @return DFA, needs to be freed using REGEX_INTERNAL_automaton_destroy. | ||
3060 | */ | ||
3061 | struct REGEX_INTERNAL_Automaton * | ||
3062 | REGEX_INTERNAL_construct_dfa (const char *regex, | ||
3063 | const size_t len, | ||
3064 | unsigned int max_path_len) | ||
3065 | { | ||
3066 | struct REGEX_INTERNAL_Context ctx; | ||
3067 | struct REGEX_INTERNAL_Automaton *dfa; | ||
3068 | struct REGEX_INTERNAL_Automaton *nfa; | ||
3069 | struct REGEX_INTERNAL_StateSet nfa_start_eps_cls; | ||
3070 | struct REGEX_INTERNAL_StateSet singleton_set; | ||
3071 | |||
3072 | REGEX_INTERNAL_context_init (&ctx); | ||
3073 | |||
3074 | /* Create NFA */ | ||
3075 | nfa = REGEX_INTERNAL_construct_nfa (regex, len); | ||
3076 | |||
3077 | if (NULL == nfa) | ||
3078 | { | ||
3079 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, | ||
3080 | "Could not create DFA, because NFA creation failed\n"); | ||
3081 | return NULL; | ||
3082 | } | ||
3083 | |||
3084 | dfa = GNUNET_new (struct REGEX_INTERNAL_Automaton); | ||
3085 | dfa->type = DFA; | ||
3086 | dfa->regex = GNUNET_strdup (regex); | ||
3087 | |||
3088 | /* Create DFA start state from epsilon closure */ | ||
3089 | memset (&singleton_set, 0, sizeof(struct REGEX_INTERNAL_StateSet)); | ||
3090 | state_set_append (&singleton_set, nfa->start); | ||
3091 | nfa_closure_set_create (&nfa_start_eps_cls, nfa, &singleton_set, NULL); | ||
3092 | state_set_clear (&singleton_set); | ||
3093 | dfa->start = dfa_state_create (&ctx, &nfa_start_eps_cls); | ||
3094 | automaton_add_state (dfa, dfa->start); | ||
3095 | |||
3096 | construct_dfa_states (&ctx, nfa, dfa, dfa->start); | ||
3097 | REGEX_INTERNAL_automaton_destroy (nfa); | ||
3098 | |||
3099 | /* Minimize DFA */ | ||
3100 | if (GNUNET_OK != dfa_minimize (&ctx, dfa)) | ||
3101 | { | ||
3102 | REGEX_INTERNAL_automaton_destroy (dfa); | ||
3103 | return NULL; | ||
3104 | } | ||
3105 | |||
3106 | /* Create proofs and hashes for all states */ | ||
3107 | if (GNUNET_OK != automaton_create_proofs (dfa)) | ||
3108 | { | ||
3109 | REGEX_INTERNAL_automaton_destroy (dfa); | ||
3110 | return NULL; | ||
3111 | } | ||
3112 | |||
3113 | /* Compress linear DFA paths */ | ||
3114 | if (1 != max_path_len) | ||
3115 | dfa_compress_paths (&ctx, dfa, max_path_len); | ||
3116 | |||
3117 | return dfa; | ||
3118 | } | ||
3119 | |||
3120 | |||
3121 | /** | ||
3122 | * Free the memory allocated by constructing the REGEX_INTERNAL_Automaton data | ||
3123 | * structure. | ||
3124 | * | ||
3125 | * @param a automaton to be destroyed | ||
3126 | */ | ||
3127 | void | ||
3128 | REGEX_INTERNAL_automaton_destroy (struct REGEX_INTERNAL_Automaton *a) | ||
3129 | { | ||
3130 | struct REGEX_INTERNAL_State *s; | ||
3131 | struct REGEX_INTERNAL_State *next_state; | ||
3132 | |||
3133 | if (NULL == a) | ||
3134 | return; | ||
3135 | |||
3136 | GNUNET_free (a->regex); | ||
3137 | GNUNET_free (a->canonical_regex); | ||
3138 | |||
3139 | for (s = a->states_head; NULL != s; s = next_state) | ||
3140 | { | ||
3141 | next_state = s->next; | ||
3142 | GNUNET_CONTAINER_DLL_remove (a->states_head, a->states_tail, s); | ||
3143 | automaton_destroy_state (s); | ||
3144 | } | ||
3145 | |||
3146 | GNUNET_free (a); | ||
3147 | } | ||
3148 | |||
3149 | |||
3150 | /** | ||
3151 | * Evaluates the given string using the given DFA automaton | ||
3152 | * | ||
3153 | * @param a automaton, type must be DFA | ||
3154 | * @param string string that should be evaluated | ||
3155 | * | ||
3156 | * @return 0 if string matches, non-0 otherwise | ||
3157 | */ | ||
3158 | static int | ||
3159 | evaluate_dfa (struct REGEX_INTERNAL_Automaton *a, const char *string) | ||
3160 | { | ||
3161 | const char *strp; | ||
3162 | struct REGEX_INTERNAL_State *s; | ||
3163 | unsigned int step_len; | ||
3164 | |||
3165 | if (DFA != a->type) | ||
3166 | { | ||
3167 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, | ||
3168 | "Tried to evaluate DFA, but NFA automaton given"); | ||
3169 | return -1; | ||
3170 | } | ||
3171 | |||
3172 | s = a->start; | ||
3173 | |||
3174 | /* If the string is empty but the starting state is accepting, we accept. */ | ||
3175 | if (((NULL == string) || (0 == strlen (string))) && s->accepting) | ||
3176 | return 0; | ||
3177 | |||
3178 | for (strp = string; NULL != strp && *strp; strp += step_len) | ||
3179 | { | ||
3180 | step_len = dfa_move (&s, strp); | ||
3181 | |||
3182 | if (NULL == s) | ||
3183 | break; | ||
3184 | } | ||
3185 | |||
3186 | if ((NULL != s) && s->accepting) | ||
3187 | return 0; | ||
3188 | |||
3189 | return 1; | ||
3190 | } | ||
3191 | |||
3192 | |||
3193 | /** | ||
3194 | * Evaluates the given string using the given NFA automaton | ||
3195 | * | ||
3196 | * @param a automaton, type must be NFA | ||
3197 | * @param string string that should be evaluated | ||
3198 | * @return 0 if string matches, non-0 otherwise | ||
3199 | */ | ||
3200 | static int | ||
3201 | evaluate_nfa (struct REGEX_INTERNAL_Automaton *a, const char *string) | ||
3202 | { | ||
3203 | const char *strp; | ||
3204 | char str[2]; | ||
3205 | struct REGEX_INTERNAL_State *s; | ||
3206 | struct REGEX_INTERNAL_StateSet sset; | ||
3207 | struct REGEX_INTERNAL_StateSet new_sset; | ||
3208 | struct REGEX_INTERNAL_StateSet singleton_set; | ||
3209 | unsigned int i; | ||
3210 | int result; | ||
3211 | |||
3212 | if (NFA != a->type) | ||
3213 | { | ||
3214 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, | ||
3215 | "Tried to evaluate NFA, but DFA automaton given"); | ||
3216 | return -1; | ||
3217 | } | ||
3218 | |||
3219 | /* If the string is empty but the starting state is accepting, we accept. */ | ||
3220 | if (((NULL == string) || (0 == strlen (string))) && a->start->accepting) | ||
3221 | return 0; | ||
3222 | |||
3223 | result = 1; | ||
3224 | memset (&singleton_set, 0, sizeof(struct REGEX_INTERNAL_StateSet)); | ||
3225 | state_set_append (&singleton_set, a->start); | ||
3226 | nfa_closure_set_create (&sset, a, &singleton_set, NULL); | ||
3227 | state_set_clear (&singleton_set); | ||
3228 | |||
3229 | str[1] = '\0'; | ||
3230 | for (strp = string; NULL != strp && *strp; strp++) | ||
3231 | { | ||
3232 | str[0] = *strp; | ||
3233 | nfa_closure_set_create (&new_sset, a, &sset, str); | ||
3234 | state_set_clear (&sset); | ||
3235 | nfa_closure_set_create (&sset, a, &new_sset, 0); | ||
3236 | state_set_clear (&new_sset); | ||
3237 | } | ||
3238 | |||
3239 | for (i = 0; i < sset.off; i++) | ||
3240 | { | ||
3241 | s = sset.states[i]; | ||
3242 | if ((NULL != s) && (s->accepting)) | ||
3243 | { | ||
3244 | result = 0; | ||
3245 | break; | ||
3246 | } | ||
3247 | } | ||
3248 | |||
3249 | state_set_clear (&sset); | ||
3250 | return result; | ||
3251 | } | ||
3252 | |||
3253 | |||
3254 | /** | ||
3255 | * Evaluates the given @a string against the given compiled regex @a a | ||
3256 | * | ||
3257 | * @param a automaton | ||
3258 | * @param string string to check | ||
3259 | * @return 0 if string matches, non-0 otherwise | ||
3260 | */ | ||
3261 | int | ||
3262 | REGEX_INTERNAL_eval (struct REGEX_INTERNAL_Automaton *a, const char *string) | ||
3263 | { | ||
3264 | int result; | ||
3265 | |||
3266 | switch (a->type) | ||
3267 | { | ||
3268 | case DFA: | ||
3269 | result = evaluate_dfa (a, string); | ||
3270 | break; | ||
3271 | |||
3272 | case NFA: | ||
3273 | result = evaluate_nfa (a, string); | ||
3274 | break; | ||
3275 | |||
3276 | default: | ||
3277 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, | ||
3278 | "Evaluating regex failed, automaton has no type!\n"); | ||
3279 | result = GNUNET_SYSERR; | ||
3280 | break; | ||
3281 | } | ||
3282 | |||
3283 | return result; | ||
3284 | } | ||
3285 | |||
3286 | |||
3287 | /** | ||
3288 | * Get the canonical regex of the given automaton. | ||
3289 | * When constructing the automaton a proof is computed for each state, | ||
3290 | * consisting of the regular expression leading to this state. A complete | ||
3291 | * regex for the automaton can be computed by combining these proofs. | ||
3292 | * As of now this function is only useful for testing. | ||
3293 | * | ||
3294 | * @param a automaton for which the canonical regex should be returned. | ||
3295 | * | ||
3296 | * @return | ||
3297 | */ | ||
3298 | const char * | ||
3299 | REGEX_INTERNAL_get_canonical_regex (struct REGEX_INTERNAL_Automaton *a) | ||
3300 | { | ||
3301 | if (NULL == a) | ||
3302 | return NULL; | ||
3303 | |||
3304 | return a->canonical_regex; | ||
3305 | } | ||
3306 | |||
3307 | |||
3308 | /** | ||
3309 | * Get the number of transitions that are contained in the given automaton. | ||
3310 | * | ||
3311 | * @param a automaton for which the number of transitions should be returned. | ||
3312 | * | ||
3313 | * @return number of transitions in the given automaton. | ||
3314 | */ | ||
3315 | unsigned int | ||
3316 | REGEX_INTERNAL_get_transition_count (struct REGEX_INTERNAL_Automaton *a) | ||
3317 | { | ||
3318 | unsigned int t_count; | ||
3319 | struct REGEX_INTERNAL_State *s; | ||
3320 | |||
3321 | if (NULL == a) | ||
3322 | return 0; | ||
3323 | |||
3324 | t_count = 0; | ||
3325 | for (s = a->states_head; NULL != s; s = s->next) | ||
3326 | t_count += s->transition_count; | ||
3327 | |||
3328 | return t_count; | ||
3329 | } | ||
3330 | |||
3331 | |||
3332 | /** | ||
3333 | * Get the first key for the given @a input_string. This hashes the first x bits | ||
3334 | * of the @a input_string. | ||
3335 | * | ||
3336 | * @param input_string string. | ||
3337 | * @param string_len length of the @a input_string. | ||
3338 | * @param key pointer to where to write the hash code. | ||
3339 | * @return number of bits of @a input_string that have been consumed | ||
3340 | * to construct the key | ||
3341 | */ | ||
3342 | size_t | ||
3343 | REGEX_INTERNAL_get_first_key (const char *input_string, | ||
3344 | size_t string_len, | ||
3345 | struct GNUNET_HashCode *key) | ||
3346 | { | ||
3347 | size_t size; | ||
3348 | |||
3349 | size = string_len < GNUNET_REGEX_INITIAL_BYTES ? string_len | ||
3350 | : GNUNET_REGEX_INITIAL_BYTES; | ||
3351 | if (NULL == input_string) | ||
3352 | { | ||
3353 | GNUNET_log (GNUNET_ERROR_TYPE_ERROR, "Given input string was NULL!\n"); | ||
3354 | return 0; | ||
3355 | } | ||
3356 | GNUNET_CRYPTO_hash (input_string, size, key); | ||
3357 | |||
3358 | return size; | ||
3359 | } | ||
3360 | |||
3361 | |||
3362 | /** | ||
3363 | * Recursive function that calls the iterator for each synthetic start state. | ||
3364 | * | ||
3365 | * @param min_len minimum length of the path in the graph. | ||
3366 | * @param max_len maximum length of the path in the graph. | ||
3367 | * @param consumed_string string consumed by traversing the graph till this state. | ||
3368 | * @param state current state of the automaton. | ||
3369 | * @param iterator iterator function called for each edge. | ||
3370 | * @param iterator_cls closure for the @a iterator function. | ||
3371 | */ | ||
3372 | static void | ||
3373 | iterate_initial_edge (unsigned int min_len, | ||
3374 | unsigned int max_len, | ||
3375 | char *consumed_string, | ||
3376 | struct REGEX_INTERNAL_State *state, | ||
3377 | REGEX_INTERNAL_KeyIterator iterator, | ||
3378 | void *iterator_cls) | ||
3379 | { | ||
3380 | char *temp; | ||
3381 | struct REGEX_INTERNAL_Transition *t; | ||
3382 | unsigned int num_edges = state->transition_count; | ||
3383 | struct REGEX_BLOCK_Edge edges[num_edges]; | ||
3384 | struct REGEX_BLOCK_Edge edge[1]; | ||
3385 | struct GNUNET_HashCode hash; | ||
3386 | struct GNUNET_HashCode hash_new; | ||
3387 | unsigned int cur_len; | ||
3388 | |||
3389 | if (NULL != consumed_string) | ||
3390 | cur_len = strlen (consumed_string); | ||
3391 | else | ||
3392 | cur_len = 0; | ||
3393 | |||
3394 | if (((cur_len >= min_len) || (GNUNET_YES == state->accepting)) && | ||
3395 | (cur_len > 0) && (NULL != consumed_string)) | ||
3396 | { | ||
3397 | if (cur_len <= max_len) | ||
3398 | { | ||
3399 | if ((NULL != state->proof) && | ||
3400 | (0 != strcmp (consumed_string, state->proof))) | ||
3401 | { | ||
3402 | (void) state_get_edges (state, edges); | ||
3403 | GNUNET_CRYPTO_hash (consumed_string, strlen (consumed_string), &hash); | ||
3404 | GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, | ||
3405 | "Start state for string `%s' is %s\n", | ||
3406 | consumed_string, | ||
3407 | GNUNET_h2s (&hash)); | ||
3408 | iterator (iterator_cls, | ||
3409 | &hash, | ||
3410 | consumed_string, | ||
3411 | state->accepting, | ||
3412 | num_edges, | ||
3413 | edges); | ||
3414 | } | ||
3415 | |||
3416 | if ((GNUNET_YES == state->accepting) && (cur_len > 1) && | ||
3417 | (state->transition_count < 1) && (cur_len < max_len)) | ||
3418 | { | ||
3419 | /* Special case for regex consisting of just a string that is shorter than | ||
3420 | * max_len */ | ||
3421 | edge[0].label = &consumed_string[cur_len - 1]; | ||
3422 | edge[0].destination = state->hash; | ||
3423 | temp = GNUNET_strdup (consumed_string); | ||
3424 | temp[cur_len - 1] = '\0'; | ||
3425 | GNUNET_CRYPTO_hash (temp, cur_len - 1, &hash_new); | ||
3426 | GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, | ||
3427 | "Start state for short string `%s' is %s\n", | ||
3428 | temp, | ||
3429 | GNUNET_h2s (&hash_new)); | ||
3430 | iterator (iterator_cls, &hash_new, temp, GNUNET_NO, 1, edge); | ||
3431 | GNUNET_free (temp); | ||
3432 | } | ||
3433 | } | ||
3434 | else /* cur_len > max_len */ | ||
3435 | { | ||
3436 | /* Case where the concatenated labels are longer than max_len, then split. */ | ||
3437 | edge[0].label = &consumed_string[max_len]; | ||
3438 | edge[0].destination = state->hash; | ||
3439 | temp = GNUNET_strdup (consumed_string); | ||
3440 | temp[max_len] = '\0'; | ||
3441 | GNUNET_CRYPTO_hash (temp, max_len, &hash); | ||
3442 | GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, | ||
3443 | "Start state at split edge `%s'-`%s` is %s\n", | ||
3444 | temp, | ||
3445 | edge[0].label, | ||
3446 | GNUNET_h2s (&hash_new)); | ||
3447 | iterator (iterator_cls, &hash, temp, GNUNET_NO, 1, edge); | ||
3448 | GNUNET_free (temp); | ||
3449 | } | ||
3450 | } | ||
3451 | |||
3452 | if (cur_len < max_len) | ||
3453 | { | ||
3454 | for (t = state->transitions_head; NULL != t; t = t->next) | ||
3455 | { | ||
3456 | if (NULL != strchr (t->label, (int) '.')) | ||
3457 | { | ||
3458 | /* Wildcards not allowed during starting states */ | ||
3459 | GNUNET_break (0); | ||
3460 | continue; | ||
3461 | } | ||
3462 | if (NULL != consumed_string) | ||
3463 | GNUNET_asprintf (&temp, "%s%s", consumed_string, t->label); | ||
3464 | else | ||
3465 | GNUNET_asprintf (&temp, "%s", t->label); | ||
3466 | iterate_initial_edge (min_len, | ||
3467 | max_len, | ||
3468 | temp, | ||
3469 | t->to_state, | ||
3470 | iterator, | ||
3471 | iterator_cls); | ||
3472 | GNUNET_free (temp); | ||
3473 | } | ||
3474 | } | ||
3475 | } | ||
3476 | |||
3477 | |||
3478 | /** | ||
3479 | * Iterate over all edges starting from start state of automaton 'a'. Calling | ||
3480 | * iterator for each edge. | ||
3481 | * | ||
3482 | * @param a automaton. | ||
3483 | * @param iterator iterator called for each edge. | ||
3484 | * @param iterator_cls closure. | ||
3485 | */ | ||
3486 | void | ||
3487 | REGEX_INTERNAL_iterate_all_edges (struct REGEX_INTERNAL_Automaton *a, | ||
3488 | REGEX_INTERNAL_KeyIterator iterator, | ||
3489 | void *iterator_cls) | ||
3490 | { | ||
3491 | struct REGEX_INTERNAL_State *s; | ||
3492 | |||
3493 | GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Iterating over starting edges\n"); | ||
3494 | iterate_initial_edge (GNUNET_REGEX_INITIAL_BYTES, | ||
3495 | GNUNET_REGEX_INITIAL_BYTES, | ||
3496 | NULL, | ||
3497 | a->start, | ||
3498 | iterator, | ||
3499 | iterator_cls); | ||
3500 | GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, "Iterating over DFA edges\n"); | ||
3501 | for (s = a->states_head; NULL != s; s = s->next) | ||
3502 | { | ||
3503 | struct REGEX_BLOCK_Edge edges[s->transition_count]; | ||
3504 | unsigned int num_edges; | ||
3505 | |||
3506 | num_edges = state_get_edges (s, edges); | ||
3507 | if (((NULL != s->proof) && (0 < strlen (s->proof))) || s->accepting) | ||
3508 | { | ||
3509 | GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, | ||
3510 | "Creating DFA edges at `%s' under key %s\n", | ||
3511 | s->proof, | ||
3512 | GNUNET_h2s (&s->hash)); | ||
3513 | iterator (iterator_cls, | ||
3514 | &s->hash, | ||
3515 | s->proof, | ||
3516 | s->accepting, | ||
3517 | num_edges, | ||
3518 | edges); | ||
3519 | } | ||
3520 | s->marked = GNUNET_NO; | ||
3521 | } | ||
3522 | } | ||
3523 | |||
3524 | |||
3525 | /** | ||
3526 | * Struct to hold all the relevant state information in the HashMap. | ||
3527 | * | ||
3528 | * Contains the same info as the Regex Iterator parameters except the key, | ||
3529 | * which comes directly from the HashMap iterator. | ||
3530 | */ | ||
3531 | struct temporal_state_store | ||
3532 | { | ||
3533 | int reachable; | ||
3534 | char *proof; | ||
3535 | int accepting; | ||
3536 | int num_edges; | ||
3537 | struct REGEX_BLOCK_Edge *edges; | ||
3538 | }; | ||
3539 | |||
3540 | |||
3541 | /** | ||
3542 | * Store regex iterator and cls in one place to pass to the hashmap iterator. | ||
3543 | */ | ||
3544 | struct client_iterator | ||
3545 | { | ||
3546 | REGEX_INTERNAL_KeyIterator iterator; | ||
3547 | void *iterator_cls; | ||
3548 | }; | ||
3549 | |||
3550 | |||
3551 | /** | ||
3552 | * Iterator over all edges of a dfa. Stores all of them in a HashMap | ||
3553 | * for later reachability marking. | ||
3554 | * | ||
3555 | * @param cls Closure (HashMap) | ||
3556 | * @param key hash for current state. | ||
3557 | * @param proof proof for current state | ||
3558 | * @param accepting GNUNET_YES if this is an accepting state, GNUNET_NO if not. | ||
3559 | * @param num_edges number of edges leaving current state. | ||
3560 | * @param edges edges leaving current state. | ||
3561 | */ | ||
3562 | static void | ||
3563 | store_all_states (void *cls, | ||
3564 | const struct GNUNET_HashCode *key, | ||
3565 | const char *proof, | ||
3566 | int accepting, | ||
3567 | unsigned int num_edges, | ||
3568 | const struct REGEX_BLOCK_Edge *edges) | ||
3569 | { | ||
3570 | struct GNUNET_CONTAINER_MultiHashMap *hm = cls; | ||
3571 | struct temporal_state_store *tmp; | ||
3572 | size_t edges_size; | ||
3573 | |||
3574 | tmp = GNUNET_new (struct temporal_state_store); | ||
3575 | tmp->reachable = GNUNET_NO; | ||
3576 | tmp->proof = GNUNET_strdup (proof); | ||
3577 | tmp->accepting = accepting; | ||
3578 | tmp->num_edges = num_edges; | ||
3579 | edges_size = sizeof(struct REGEX_BLOCK_Edge) * num_edges; | ||
3580 | tmp->edges = GNUNET_malloc (edges_size); | ||
3581 | GNUNET_memcpy (tmp->edges, edges, edges_size); | ||
3582 | GNUNET_assert (GNUNET_YES == | ||
3583 | GNUNET_CONTAINER_multihashmap_put ( | ||
3584 | hm, | ||
3585 | key, | ||
3586 | tmp, | ||
3587 | GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_FAST)); | ||
3588 | } | ||
3589 | |||
3590 | |||
3591 | /** | ||
3592 | * Mark state as reachable and call recursively on all its edges. | ||
3593 | * | ||
3594 | * If already marked as reachable, do nothing. | ||
3595 | * | ||
3596 | * @param state State to mark as reachable. | ||
3597 | * @param hm HashMap which stores all the states indexed by key. | ||
3598 | */ | ||
3599 | static void | ||
3600 | mark_as_reachable (struct temporal_state_store *state, | ||
3601 | struct GNUNET_CONTAINER_MultiHashMap *hm) | ||
3602 | { | ||
3603 | struct temporal_state_store *child; | ||
3604 | unsigned int i; | ||
3605 | |||
3606 | if (GNUNET_YES == state->reachable) | ||
3607 | /* visited */ | ||
3608 | return; | ||
3609 | |||
3610 | state->reachable = GNUNET_YES; | ||
3611 | for (i = 0; i < state->num_edges; i++) | ||
3612 | { | ||
3613 | child = | ||
3614 | GNUNET_CONTAINER_multihashmap_get (hm, &state->edges[i].destination); | ||
3615 | if (NULL == child) | ||
3616 | { | ||
3617 | GNUNET_break (0); | ||
3618 | continue; | ||
3619 | } | ||
3620 | mark_as_reachable (child, hm); | ||
3621 | } | ||
3622 | } | ||
3623 | |||
3624 | |||
3625 | /** | ||
3626 | * Iterator over hash map entries to mark the ones that are reachable. | ||
3627 | * | ||
3628 | * @param cls closure | ||
3629 | * @param key current key code | ||
3630 | * @param value value in the hash map | ||
3631 | * @return #GNUNET_YES if we should continue to iterate, | ||
3632 | * #GNUNET_NO if not. | ||
3633 | */ | ||
3634 | static int | ||
3635 | reachability_iterator (void *cls, | ||
3636 | const struct GNUNET_HashCode *key, | ||
3637 | void *value) | ||
3638 | { | ||
3639 | struct GNUNET_CONTAINER_MultiHashMap *hm = cls; | ||
3640 | struct temporal_state_store *state = value; | ||
3641 | |||
3642 | if (GNUNET_YES == state->reachable) | ||
3643 | /* already visited and marked */ | ||
3644 | return GNUNET_YES; | ||
3645 | |||
3646 | if ((GNUNET_REGEX_INITIAL_BYTES > strlen (state->proof)) && | ||
3647 | (GNUNET_NO == state->accepting) ) | ||
3648 | /* not directly reachable */ | ||
3649 | return GNUNET_YES; | ||
3650 | |||
3651 | mark_as_reachable (state, hm); | ||
3652 | return GNUNET_YES; | ||
3653 | } | ||
3654 | |||
3655 | |||
3656 | /** | ||
3657 | * Iterator over hash map entries. | ||
3658 | * Calling the callback on the ones marked as reachables. | ||
3659 | * | ||
3660 | * @param cls closure | ||
3661 | * @param key current key code | ||
3662 | * @param value value in the hash map | ||
3663 | * @return #GNUNET_YES if we should continue to iterate, | ||
3664 | * #GNUNET_NO if not. | ||
3665 | */ | ||
3666 | static int | ||
3667 | iterate_reachables (void *cls, const struct GNUNET_HashCode *key, void *value) | ||
3668 | { | ||
3669 | struct client_iterator *ci = cls; | ||
3670 | struct temporal_state_store *state = value; | ||
3671 | |||
3672 | if (GNUNET_YES == state->reachable) | ||
3673 | { | ||
3674 | ci->iterator (ci->iterator_cls, | ||
3675 | key, | ||
3676 | state->proof, | ||
3677 | state->accepting, | ||
3678 | state->num_edges, | ||
3679 | state->edges); | ||
3680 | } | ||
3681 | GNUNET_free (state->edges); | ||
3682 | GNUNET_free (state->proof); | ||
3683 | GNUNET_free (state); | ||
3684 | return GNUNET_YES; | ||
3685 | } | ||
3686 | |||
3687 | |||
3688 | /** | ||
3689 | * Iterate over all edges of automaton 'a' that are reachable from a state with | ||
3690 | * a proof of at least GNUNET_REGEX_INITIAL_BYTES characters. | ||
3691 | * | ||
3692 | * Call the iterator for each such edge. | ||
3693 | * | ||
3694 | * @param a automaton. | ||
3695 | * @param iterator iterator called for each reachable edge. | ||
3696 | * @param iterator_cls closure. | ||
3697 | */ | ||
3698 | void | ||
3699 | REGEX_INTERNAL_iterate_reachable_edges (struct REGEX_INTERNAL_Automaton *a, | ||
3700 | REGEX_INTERNAL_KeyIterator iterator, | ||
3701 | void *iterator_cls) | ||
3702 | { | ||
3703 | struct GNUNET_CONTAINER_MultiHashMap *hm; | ||
3704 | struct client_iterator ci; | ||
3705 | |||
3706 | hm = GNUNET_CONTAINER_multihashmap_create (a->state_count * 2, GNUNET_NO); | ||
3707 | ci.iterator = iterator; | ||
3708 | ci.iterator_cls = iterator_cls; | ||
3709 | |||
3710 | REGEX_INTERNAL_iterate_all_edges (a, &store_all_states, hm); | ||
3711 | GNUNET_CONTAINER_multihashmap_iterate (hm, &reachability_iterator, hm); | ||
3712 | GNUNET_CONTAINER_multihashmap_iterate (hm, &iterate_reachables, &ci); | ||
3713 | |||
3714 | GNUNET_CONTAINER_multihashmap_destroy (hm); | ||
3715 | } | ||
3716 | |||
3717 | |||
3718 | /* end of regex_internal.c */ | ||