1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
;; This file is part of scheme-GNUnet
;; Copyright (C) 2021 GNUnet e.V.
;;
;; scheme-GNUnet is free software: you can redistribute it and/or modify it
;; under the terms of the GNU Affero General Public License as published
;; by the Free Software Foundation, either version 3 of the License,
;; or (at your option) any later version.
;;
;; GNUnet is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; Affero General Public License for more details.
;;
;; You should have received a copy of the GNU Affero General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;
;; SPDX-License-Identifier: AGPL-3.0-or-later
;; TODO: look into integrating this into Guile proper.
(define-module (web form)
#:use-module (rnrs bytevectors)
#:use-module (ice-9 match)
#:use-module (ice-9 binary-ports)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 control)
#:use-module (ice-9 string-fun)
#:export (urlencoded->alist))
;; application/x-www-form-urlencoded, documented in 8.2.1.
;; of RFC 1866
;; 8.2.1 ‘[...] space characters are replaced by #\+ [...]’
;;
;; Presumably only #\ is meant here and not the non-breaking space (NBSP),
;; otherwise NBSP could not be distinguished from the regular space character
;; #\ .
;;
;; 8.2.1 ‘[...] [non-alphanumeric] characters are replaced by %HH [...]’.
;;
;; Presumably with ‘non-alphanumeric’, ‘non-alphanumeric or non-ASCII’
;; is meant here, otherwise the validity of application/x-www-form-urlencoded
;; data could depend on the Unicode standard used.
;;
;; In practice, Firefox doesn't escape - and _, so include those as well
;; for compatibility.
;; TODO: isn't a-zA-Z0-9 problematic under some locales?
(define encoded-pat "^(\\+|[a-zA-Z0-9_-]|%[0-9A-F][0-9A-F])*$")
(define encoded-regex (make-regexp encoded-pat))
(define (try-utf8->string bv)
"Like utf8->string, but return #false instead of raising an error if
@var{bv} is not valid UTF-8."
(catch 'decoding-error
;; RFC 1866 doesn't specify the character encoding, so assume UTF-8.
(lambda () (utf8->string bv))
(lambda _ #false)))
(define (urlencoded-string->alist string)
(let/ec return
(let ()
(define (oops)
(return #false))
(when (string-null? string)
(return '()))
(define fields (string-split string #\&))
(define (unescape string)
;; Validate the syntax of STRING ...
(unless (regexp-exec encoded-regex string)
(oops))
;; ... replace #\+ with #\ ...
(define string-with-space (string-replace-substring string "+" " "))
(define bv
(call-with-output-bytevector
(lambda (port)
;; ... and undo % escapes.
(define (search remainder)
(define next-% (string-index remainder #\%))
(if next-%
(begin
(put-string port (substring remainder 0 next-%))
(undo-% (substring remainder next-%)))
(put-string port remainder)))
(define (undo-% remainder)
(define octet
(string->number (substring remainder 1 3) 16))
;; 8.2.1 ‘[...] [non-alphanumeric] characters are replaced by
;; %HH [...]’.
;;
;; The syntax of application/x-www-form-urlencoded is given in
;; terms of how to encode the fields, and alphanumeric characters
;; are not included there, thus alphanumeric characters are
;; forbidden.
(when (or (<= (char->integer #\a) octet (char->integer #\z))
(<= (char->integer #\A) octet (char->integer #\Z))
(<= (char->integer #\0) octet (char->integer #\9)))
(oops))
(put-u8 port octet)
(search (substring remainder 3)))
(search string-with-space))))
;; RFC 1866 doesn't specify the character encoding, so assume UTF-8.
;; The resulting bytevector could be bogus UTF-8, so catch
;; 'decoding-error'.
(or (try-utf8->string bv)
(oops)))
(define (decode-field field)
(match (string-split field #\=)
((escaped-field-name escaped-field-value)
(cons (unescape escaped-field-name) (unescape escaped-field-value)))
(_ (oops))))
(map decode-field fields))))
(define (urlencoded->alist body)
"Decode body, a bytevector holding a application/x-www-form-urlencoded,
to an association list of string-valued key-value pairs. Return #false
if the bytevector could not be parsed."
(and=> (try-utf8->string body) urlencoded-string->alist))
|