aboutsummaryrefslogtreecommitdiff
path: root/gnu/gnunet/utils/tokeniser.scm
blob: 079de0aae748b7adefb127d3c53fe8c92c85cb14 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
;; This file is part of scheme-GNUnet, a Scheme port of GNUnet .
;;  Copyright (C) 2010, 2016, 2017, 2021 GNUnet e.V.
;;
;; GNUnet is free software: you can redistribute it and/or modify it
;; under the terms of the GNU Affero General Public License as published
;; by the Free Software Foundation, either version 3 of the License,
;;  or (at your option) any later version.
;;
;; GNUnet is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; Affero General Public License for more details.
;;
;; You should have received a copy of the GNU Affero General Public License
;;  along with this program.  If not, see <http://www.gnu.org/licenses/>.
;;
;; SPDX-License-Identifier: AGPL3.0-or-later

;; C file: util/mst.c
;; Brief: convenience functions for handling inbound message buffers
;; Author: Christian Grothoff
;; Adapted to Scheme by Maxime Devos
;;
;; The most prominent use would be the implementation of message queues
;; over stream sockets, where the separate messages need to be split
;; from each other.  However, it is used in some other places as
;; well.
;;
;; The Scheme implementation does not support the 'purge' and 'one-shot'
;; modes. 'purge' should be simple to implement though and 'one-shot'
;; could be implemented with delimited continuations.
;;
;; The implementation avoids copying when possible.

(define-library (gnu gnunet utils tokeniser)
  (export make-tokeniser
	  tokeniser?

	  &interrupted-tokeniser-violation
	  make-interrupted-tokeniser-violation
	  interrupted-tokeniser-violation?

	  &kaput-tokeniser-error
	  make-kaput-tokeniser-error
	  kaput-tokeniser-error?

	  add-bytevector!
	  add-from-port!)
  (import (only (rnrs base)
		define and < assert begin quote lambda
		>= integer? exact? <= expt = cond
		let + - eq? > * min if)
	  (only (rnrs conditions)
		define-condition-type condition make-who-condition
		&violation &error)
	  (only (rnrs exceptions)
		raise)
	  (only (rnrs bytevectors)
		endianness bytevector-copy! bytevector? bytevector-length
		bytevector-u8-ref bytevector-u8-set! bytevector-u16-ref
		bytevector-u16-set! make-bytevector)
	  (only (rnrs records syntactic)
		define-record-type)
	  (only (srfi srfi-26) cut)
	  (only (guile) lambda*)
	  (only (ice-9 binary-ports) get-bytevector-some!)
	  (only (ice-9 ports) eof-object?)
	  (only (gnu gnunet util struct)
		/:message-header)
	  (only (gnu gnunet netstruct syntactic)
		sizeof)
	  (only (gnu gnunet utils hat-let) let^))
  (begin
    (define-condition-type &interrupted-tokeniser-violation &violation
      make-interrupted-tokeniser-violation interrupted-tokeniser-violation?)
    (define-condition-type &kaput-tokeniser-error &error
      make-kaput-tokeniser-error kaput-tokeniser-error?)

    (define-record-type
	(<tokeniser> make-tokeniser tokeniser-state?)
      ;; Current buffer.
      ;;
      ;; Alternatively, when @code{add-bytevector-copy!} is
      ;; being called, this is temporarily set to @code{#f},
      ;; to detect re-entrancy. And if a message with size
      ;; less than the message header is found, then it is set
      ;; to @code{#t}, marking the tokeniser as ‘kaput’.
      (fields (mutable buffer tokeniser-buffer set-tokeniser-buffer!)
	      ;; Number of bytes in the buffer.
	      (mutable position tokeniser-position set-tokeniser-position!))
      (protocol
       (lambda (%make)
	 (lambda* (#:key
		   (initial-size (sizeof /:message-header '())))
	   "Make an empty tokeniser.  A buffer of size @var{initial-size}
will be pre-allocated.  This size must be an exact natural and it
might be adjusted."
	   (assert (and (integer? initial-size) (exact? initial-size)
			(>= initial-size 0)))
	   (%make (make-bytevector
		   (cond ((<= initial-size (sizeof /:message-header '()))
			  (sizeof /:message-header '()))
			 ((>= initial-size (expt 2 16))
			  (expt 2 16))
			 (#t initial-size)))
		  0))))
      (opaque #t)
      (sealed #t))

    (define (add-bytevector! tok bv offset length
			     handle/message
			     return/done
			     return/overly-small)
      "Feed up to @var{length} bytes from the bytevector @var{bv}
starting at @var{offset} to the tokeniser @var{tok}.

When a complete message is assembled, the callback @var{handle/message}
is called with an appropriate bytevector region.  This bytevector region
is part of the passed bytevector range (@var{bv}, @var{offset}, @var{length})
or the tokeniser's internal buffer.

If a message size was overly small, i.e., smaller than its header,
then @var{return/overly-small} is called in tail position with the
specified message type (as an integer) and message size.  In that case,
@var{tok} will be marked as kaput.  As the message type is not always
available, sometimes @code{#false} will be pased instead.

On success, @code{return/done} is called in tail position without
arguments.

This procedure may only be called if @var{tok} isn't kaput,
and it may not be called re-entrantly.  In the former case,
a @code{&kaput-tokeniser-error} is raised.  In the latter case,
a @code{&interrupted-tokeniser-violation} may be raised
but this cannot be guaranteed."
      ;; ^ mainly due to parallelism reasons
      (define set-buffer! (cut set-tokeniser-buffer! tok <>))
      (define set-position! (cut set-tokeniser-position! tok <>))
      (define mark-kaput! (cut set-tokeniser-buffer! tok #t))
      (define (maybe-reallocate/no-move buffer minimal-size)
	"Return a fresh bytevector or the bytevector @var{buffer} of
at least size @var{minimal-size}.  Avoid allocations."
	(if (<= minimal-size (bytevector-length buffer))
	    buffer
	    (make-bytevector minimal-size)))
      ;; Possibilities:
      ;;  (a) @var{length} is zero. Then there's nothing to do!
      ;;      The other possibilities will assume @var{length}
      ;;      is at least one.
      ;;
      ;;  (b) If the tokeniser buffer is empty and @var{bv} starts
      ;;      with a complete message, then call the processor
      ;;      on the message and continue.
      ;;
      ;;  (c) If the tokeniser buffer is empty and @var{bv} starts
      ;;      with an incomplete message, then copy the partial message
      ;;      to the tokeniser buffer (reallocating it if necessary)
      ;;      and stop.
      ;;
      ;;      If the message size is known, it is considered necessary
      ;;      for the tokeniser buffer to be at least that size.
      ;;
      ;;  (d) If the tokeniser buffer is non-empty and the message size
      ;;      of the partial message in that buffer is unknown,
      ;;      then determine the size, if necessary reallocate the tokeniser
      ;;      buffer to be at least that size, copy the size into the
      ;;      buffer and continue.
      ;;
      ;;      The message won't be complete yet, as a 'type' field always
      ;;      comes after the 'size' field in the message header.
      ;;
      ;;  (e) If the tokeniser buffer is non-empty and the message size
      ;;      of the partial message in that buffer is known,
      ;;      then copy the remainder of the message into the tokeniser
      ;;      buffer (as far as possible).
      ;;
      ;;      If this makes the message complete, then process the message
      ;;      and continue.  If the message isn't complete, then just stop,
      ;;      as all of @var{bv} has been copied.
      (define (continue buffer position bv offset length)
	;; The buffer is set before calls to return/done or return/overly-small.
	;; The position is set after it changes and before the tail-iteration
	;; into 'continue'.
	(cond ((= length 0)
	       (set-buffer! buffer)
	       (return/done)) ; possibility (a)
	      ((and (= position 0)
		    ;; possibility (c), length unknown
		    (< length 2))
	       (bytevector-u8-set! buffer position
				   (bytevector-u8-ref bv offset))
	       (set-buffer! buffer)
	       (set-position! 1)
	       (return/done))
	      ((= position 0) ; and (>= length 2)
	       (let ((size (bytevector-u16-ref bv offset (endianness big))))
		 (cond ((< size (sizeof /:message-header '()))
			(mark-kaput!)
			(return/overly-small
			 (and (>= length 4)
			      ;; + 2: skip the "size" field and read
			      ;; the 'type' field
			      (bytevector-u16-ref bv (+ offset 2)
						  (endianness big)))
			 size))
		       ;; possibility (b)
		       ((<= size length)
			(handle/message bv offset size)
			(continue buffer position bv (+ offset size)
				  (- length size)))
		       ;; Now, (< length size) -- possibility (c).
		       (#t
			(let ((buffer
			       ;; Re-allocate the buffer if required.
			       (maybe-reallocate/no-move buffer size)))
			  ;; Write the partial message to the buffer
			  (bytevector-copy! bv offset buffer 0 length)
			  (set-buffer! buffer)
			  (set-position! (+ position length))
			  (return/done))))))
	      ((>= position 2) ; possibility (e)
	       (let^ ((! size (bytevector-u16-ref buffer 0 (endianness big)))
		      (!! (<= (sizeof /:message-header '()) size))
		      (!! (<= size (bytevector-length buffer)))
		      (!! (< position size))
		      ;; How many bytes must be copied?
		      (! extra (min length (- size position)))
		      ;; Copy the bytes.
		      (_ (bytevector-copy! bv offset buffer position extra))
		      (! position (+ position extra))
		      (!! (<= position size))
		      ;; do not set the buffer yet, such that
		      ;; re-entrancy from the 'handle/message' callback
		      ;; can be detected.
		      (? (< position size)
			 ;; Message is not yet complete --> stop.
			 (assert (= length extra))
			 (set-buffer! buffer)
			 ;; some bytes have been copied
			 (set-position! position)
			 (return/done)))
		     ;; Message is complete --> process it and continue
		     ;; (there may be other messages as well!)
		     (handle/message buffer 0 size)
		     (set-position! 0)
		     (continue buffer 0 bv (+ offset extra) (- length extra))))
	      ;; (< position 2), possibility (d)
	      (#t
	       (let^ ((! size/byte-0 (bytevector-u8-ref buffer 0))
		      (! size/byte-1 (bytevector-u8-ref bv offset))
		      (! size (+ (* (expt 2 8) size/byte-0)
				 size/byte-1))
		      (? (< size (sizeof /:message-header '()))
			 (mark-kaput!)
			 (return/overly-small
			  (and (>= length 3)
			       (bytevector-u16-ref bv (+ offset 1)
						   (endianness big)))
			  size))
		      (! buffer (maybe-reallocate/no-move buffer size)))
		     (bytevector-u16-set! buffer 0 size (endianness big))
		     (set-position! 2)
		     (continue buffer 2 bv (+ offset 1) (- length 1))))))
      (let^ ((! buffer (tokeniser-buffer tok))
	     (! position (tokeniser-position tok))
	     (? (eq? buffer #t)
		(raise (condition
			(make-who-condition 'add-bytevector!)
			(make-kaput-tokeniser-error))))
	     (? (eq? buffer #f)
		(raise (condition
			(make-who-condition 'add-bytevector!)
			(make-interrupted-tokeniser-violation))))
	     (!! (and (bytevector? buffer)
		      (integer? position)
		      (exact? position)
		      (integer? offset)
		      (exact? offset)
		      (integer? length)
		      (exact? length)
		      (<= (+ offset length) (bytevector-length bv))
		      (<= 0 position)
		      (< position (bytevector-length buffer)))))
	    ;; The buffer will be restored at the call to
	    ;; 'return/done' or 'return/overly-small'.
	    (set-buffer! #f)
	    (continue buffer position bv offset length)))

    (define (add-from-port! tok port handle/message return/overly-small
			    return/done-eof return/premature-eof)
      "Keep reading data from the input port @var{port}, feeding them
to the tokeniser @var{tok}.

The procedures @var{handle/message}, and @var{return/overly-small} are used
as in @code{add-bytevector!}.  When the end of file has been reached, and
@var{tok} doesn't hold a partial message, the thunk @var{return/done-eof}
is called in tail position.  When the end of file has been reached, and
@var{tok} does still hold a partial message, the thunk
@var{return/premature-eof} is instead called in tail position.

As with @ode{add-bytevector!}, @code{&kaput-tokeniser-error} and
@code{&interrupted-tokeniser-violation} can be raised.

This is a blocking operation!."
      ;; Cheaty, but it works!  I'd presume Guile or glibc have an
      ;; optimisation for copying a memory region to itself.  Also,
      ;; this saves a buffer allocation.
      (let^ ((! buffer (tokeniser-buffer tok))
	     (! position (tokeniser-position tok))
	     (? (eq? buffer #t)
		(raise (condition
			(make-who-condition 'add-from-port!)
			(make-kaput-tokeniser-error))))
	     (? (eq? buffer #f)
		(raise (condition
			(make-who-condition 'add-from-port!)
			(make-interrupted-tokeniser-violation))))
	     (! length (- (bytevector-length buffer) position))
	     (! n/read (get-bytevector-some! port buffer position length))
	     (? (eof-object? n/read)
		;; If 'position' is 0, then there was no incomplete
		;; message in the tokeniser.
		((if (= position 0) return/done-eof return/premature-eof)))
	     (! (return/add-bytevector!-done)
		(add-from-port! tok port handle/message return/overly-small
				return/done-eof return/premature-eof)))
	    (add-bytevector! tok buffer position n/read handle/message
			     return/add-bytevector!-done
			     return/overly-small)))))