From 27b5cd8d33536fd005a185b745077a1942590bfe Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Mon, 10 Aug 2020 14:28:37 +0200 Subject: [PATCH 01/24] init json5-parser --- lib/json5/dune | 6 ++++++ lib/json5/lexer.mll | 9 +++++++++ lib/json5/types.ml | 7 +++++++ test/json5/dune | 4 ++++ test/json5/json5_test.ml | 13 +++++++++++++ yojson_json5.opam | 30 ++++++++++++++++++++++++++++++ 6 files changed, 69 insertions(+) create mode 100644 lib/json5/dune create mode 100644 lib/json5/lexer.mll create mode 100644 lib/json5/types.ml create mode 100644 test/json5/dune create mode 100644 test/json5/json5_test.ml create mode 100644 yojson_json5.opam diff --git a/lib/json5/dune b/lib/json5/dune new file mode 100644 index 00000000..c1a0c07f --- /dev/null +++ b/lib/json5/dune @@ -0,0 +1,6 @@ +(ocamllex lexer) + +(library + (name yojson_json5) + (public_name yojson_json5) + (preprocess (pps ppx_deriving.show))) diff --git a/lib/json5/lexer.mll b/lib/json5/lexer.mll new file mode 100644 index 00000000..bed1234f --- /dev/null +++ b/lib/json5/lexer.mll @@ -0,0 +1,9 @@ +{ + open Types +} + +rule read_token = parse + | "{" { OPEN_BRACE } + | "}" { CLOSE_BRACE } + | "[" { OPEN_BRACKET } + | "]" { CLOSE_BRACKET } diff --git a/lib/json5/types.ml b/lib/json5/types.ml new file mode 100644 index 00000000..343629ba --- /dev/null +++ b/lib/json5/types.ml @@ -0,0 +1,7 @@ +type token = + | OPEN_BRACE + | CLOSE_BRACE + | OPEN_BRACKET + | CLOSE_BRACKET + [@@deriving show] + diff --git a/test/json5/dune b/test/json5/dune new file mode 100644 index 00000000..d9b12ce2 --- /dev/null +++ b/test/json5/dune @@ -0,0 +1,4 @@ +(executable + (name json5_test) + (libraries yojson_json5) + (preprocess (pps ppx_deriving.show))) diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml new file mode 100644 index 00000000..6e9c5c11 --- /dev/null +++ b/test/json5/json5_test.ml @@ -0,0 +1,13 @@ +module Y = Yojson_json5 + +type token_list = Y.Types.token list [@@deriving show] + +let () = + let lex_buffer = Lexing.from_string "{}" in + let result = Y.Lexer.read_token lex_buffer in + Y.Types.show_token result + |> print_endline; + + let result = Y.Lexer.read_token lex_buffer in + Y.Types.show_token result + |> print_endline diff --git a/yojson_json5.opam b/yojson_json5.opam new file mode 100644 index 00000000..64dd13f2 --- /dev/null +++ b/yojson_json5.opam @@ -0,0 +1,30 @@ +opam-version: "2.0" +maintainer: ["nathan@cryptosense.com" "marek@xivilization.net"] +authors: ["Martin Jambon"] +homepage: "https://github.com/ocaml-community/yojson" +bug-reports: "https://github.com/ocaml-community/yojson/issues" +dev-repo: "git+https://github.com/ocaml-community/yojson.git" +doc: "https://ocaml-community.github.io/yojson/" +build: [ + ["dune" "subst"] {pinned} + ["dune" "build" "-p" name "-j" jobs] +] +run-test: [["dune" "runtest" "-p" name "-j" jobs]] +depends: [ + "ocaml" {>= "4.02.3"} + "dune" + "alcotest" {with-test & >= "0.8.5"} +] +synopsis: + "Yojson is an optimized parsing and printing library for the JSON format" +description: """ +Yojson is an optimized parsing and printing library for the JSON format. + +It addresses a few shortcomings of json-wheel including 2x speedup, +polymorphic variants and optional syntax for tuples and variants. + +ydump is a pretty-printing command-line program provided with the +yojson package. + +The program atdgen can be used to derive OCaml-JSON serializers and +deserializers from type definitions.""" From 92e2615160f0f1c17095106e0bb79d6f1f4c8e0b Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Mon, 10 Aug 2020 17:00:30 +0200 Subject: [PATCH 02/24] add numbers --- lib/json5/lexer.mll | 32 ++++++++++++++++++++++++++++++++ lib/json5/types.ml | 6 ++++++ test/json5/json5_test.ml | 18 +++++++++++------- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/lib/json5/lexer.mll b/lib/json5/lexer.mll index bed1234f..e0e5bfb0 100644 --- a/lib/json5/lexer.mll +++ b/lib/json5/lexer.mll @@ -2,8 +2,40 @@ open Types } +(* From https://www.ecma-international.org/ecma-262/5.1/#sec-7 *) + +(* NUMBERS, 7.8.3 *) +let non_zero_digit = ['1'-'9'] +let decimal_digit = ['0'-'9'] +let decimal_digits = decimal_digit+ +let hex_digit = [ '0'-'9' 'a'-'f' 'A'-'F' ] +let exponent_indicator = ( 'e' | 'E' ) +let signed_integer = ( decimal_digits | '+' decimal_digits | '-' decimal_digits ) +let exponent_part = exponent_indicator signed_integer +let decimal_integer_literal = ( '0' | non_zero_digit decimal_digits? ) +let decimal_literal = ( + decimal_integer_literal '.' decimal_digits? exponent_part? + | '.' decimal_digits exponent_part? + | decimal_integer_literal exponent_part? +) +let hex_integer_literal = ( "0x" hex_digit+ | "0X" hex_digit+ ) +let numeric_literal = ( decimal_literal | hex_integer_literal ) +let json5_numeric_literal = ( numeric_literal | "Infinity" | "NaN" ) +let json5_number = (json5_numeric_literal | '+' json5_numeric_literal | '-' json5_numeric_literal ) + +(* STRINGS, 7.8.4 *) + rule read_token = parse | "{" { OPEN_BRACE } | "}" { CLOSE_BRACE } | "[" { OPEN_BRACKET } | "]" { CLOSE_BRACKET } + | ":" { COLON } + | "," { COMMA } + | "true" { TRUE } + | "false" { FALSE } + | "null" { NULL } + | json5_number { + let s = Lexing.lexeme lexbuf in + NUMBER s + } diff --git a/lib/json5/types.ml b/lib/json5/types.ml index 343629ba..ffe51c87 100644 --- a/lib/json5/types.ml +++ b/lib/json5/types.ml @@ -3,5 +3,11 @@ type token = | CLOSE_BRACE | OPEN_BRACKET | CLOSE_BRACKET + | COLON + | COMMA + | TRUE + | FALSE + | NULL + | NUMBER of string [@@deriving show] diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index 6e9c5c11..92d2de13 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -3,11 +3,15 @@ module Y = Yojson_json5 type token_list = Y.Types.token list [@@deriving show] let () = - let lex_buffer = Lexing.from_string "{}" in - let result = Y.Lexer.read_token lex_buffer in - Y.Types.show_token result - |> print_endline; + let lex_buffer = Lexing.from_string "{[0x42e13]}" in + + let rec loop lex_buf token_list = + match Y.Lexer.read_token lex_buf with + | exception Failure _ -> token_list + | token -> loop lex_buf (token::token_list) + in - let result = Y.Lexer.read_token lex_buffer in - Y.Types.show_token result - |> print_endline + let result = loop lex_buffer [] in + List.rev result + |> show_token_list + |> print_endline; From 912cea1a0310fc36db4a893775c6477a568a9b6f Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Thu, 29 Oct 2020 13:08:11 +0100 Subject: [PATCH 03/24] not sure what is here --- lib/json5/lexer.mll | 41 ++++++++++++++++++++++++++++++---------- lib/json5/types.ml | 6 +++++- test/json5/json5_test.ml | 40 ++++++++++++++++++++++++++++----------- 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/lib/json5/lexer.mll b/lib/json5/lexer.mll index e0e5bfb0..7435638b 100644 --- a/lib/json5/lexer.mll +++ b/lib/json5/lexer.mll @@ -13,15 +13,23 @@ let exponent_indicator = ( 'e' | 'E' ) let signed_integer = ( decimal_digits | '+' decimal_digits | '-' decimal_digits ) let exponent_part = exponent_indicator signed_integer let decimal_integer_literal = ( '0' | non_zero_digit decimal_digits? ) -let decimal_literal = ( - decimal_integer_literal '.' decimal_digits? exponent_part? - | '.' decimal_digits exponent_part? - | decimal_integer_literal exponent_part? -) let hex_integer_literal = ( "0x" hex_digit+ | "0X" hex_digit+ ) -let numeric_literal = ( decimal_literal | hex_integer_literal ) -let json5_numeric_literal = ( numeric_literal | "Infinity" | "NaN" ) -let json5_number = (json5_numeric_literal | '+' json5_numeric_literal | '-' json5_numeric_literal ) +(* float *) +let float_literal = ( decimal_integer_literal '.' decimal_digits? exponent_part? | '.' decimal_digits exponent_part? ) +let json5_float = ( float_literal | '+' float_literal | '-' float_literal ) +(* int_or_float *) +let int_or_float_literal = decimal_integer_literal exponent_part? +let json5_int_or_float = ( int_or_float_literal | '+' int_or_float_literal | '-' int_or_float_literal ) +(* int/hex *) +let json5_int = ( hex_integer_literal | '+' hex_integer_literal | '-' hex_integer_literal ) + +(* IDENTIFIER_NAME (keys in objects) *) +let unicode_escape_squence = 'u' hex_digit hex_digit hex_digit hex_digit +let unicode_letter = [ 'a'-'z' 'A'-'F' ] +let identifier_start = ( unicode_letter | '$' | '_' | '\\' unicode_escape_squence ) +let identifier_part = ( identifier_start | decimal_digits ) (* unicode_combining_mark, unicode_connector_punctuation, ZWNJ and NWJ missing *) +let identifier_name = identifier_start identifier_part+? + (* STRINGS, 7.8.4 *) @@ -35,7 +43,20 @@ rule read_token = parse | "true" { TRUE } | "false" { FALSE } | "null" { NULL } - | json5_number { + | " " { SPACE } + | json5_float { + let s = float_of_string @@ Lexing.lexeme lexbuf in + FLOAT s + } + | json5_int_or_float { + let s = Lexing.lexeme lexbuf in + INT_OR_FLOAT s + } + | json5_int { + let s = int_of_string @@ Lexing.lexeme lexbuf in + INT s + } + | identifier_name { let s = Lexing.lexeme lexbuf in - NUMBER s + IDENTIFIER_NAME s } diff --git a/lib/json5/types.ml b/lib/json5/types.ml index ffe51c87..b2742fe5 100644 --- a/lib/json5/types.ml +++ b/lib/json5/types.ml @@ -8,6 +8,10 @@ type token = | TRUE | FALSE | NULL - | NUMBER of string + | SPACE + | FLOAT of float + | INT_OR_FLOAT of string + | INT of int + | IDENTIFIER_NAME of string [@@deriving show] diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index 92d2de13..eed0cbe2 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -1,17 +1,35 @@ -module Y = Yojson_json5 +module Lexer = Yojson_json5.Lexer +open Yojson_json5.Types + +type token_list = token list [@@deriving show] -type token_list = Y.Types.token list [@@deriving show] let () = - let lex_buffer = Lexing.from_string "{[0x42e13]}" in + let check_json (name, json_string, expected) = + let lex_buffer = Lexing.from_string json_string in - let rec loop lex_buf token_list = - match Y.Lexer.read_token lex_buf with - | exception Failure _ -> token_list - | token -> loop lex_buf (token::token_list) + let rec loop lex_buf token_list = + match Lexer.read_token lex_buf with + | exception Failure _ -> token_list + | token -> loop lex_buf (token::token_list) + in + let result = loop lex_buffer [] + |> List.rev + |> show_token_list + in + let expected = show_token_list expected in + if result <> expected then + print_string @@ name ^ " failed:\n\nInput:\n" ^ result ^ "\n\nExpected:\n" ^ expected in - let result = loop lex_buffer [] in - List.rev result - |> show_token_list - |> print_endline; + let lexer_tests = [ + ("Float, no leading number", ".52", [FLOAT 0.52]); + ("Float, simple", "23.52", [FLOAT 23.52]); + ("Float with e & E", "2.1e2 2.1E2", [FLOAT 210.; SPACE; FLOAT 210.]); + ("Int of float", "42", [INT_OR_FLOAT "42"]); + ("Hex/Int", "0x10", [INT 16]); + ("identifer name in an object", "{hj: 42}", [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; SPACE; INT_OR_FLOAT "42"; CLOSE_BRACE]); + ] in + + let _ = List.map check_json lexer_tests in + () From d7cf1c609762bd57b77eab39a4f38d9d408b5af1 Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Thu, 29 Oct 2020 15:06:30 +0100 Subject: [PATCH 04/24] Add sedlex --- lib/json5/dune | 8 +++-- lib/json5/lexer.ml | 77 ++++++++++++++++++++++++++++++++++++++++ lib/json5/lexer.mll | 62 -------------------------------- lib/json5/types.ml | 3 +- test/json5/dune | 2 +- test/json5/json5_test.ml | 23 +++++------- yojson_json5.opam | 1 + 7 files changed, 94 insertions(+), 82 deletions(-) create mode 100644 lib/json5/lexer.ml delete mode 100644 lib/json5/lexer.mll diff --git a/lib/json5/dune b/lib/json5/dune index c1a0c07f..740650ff 100644 --- a/lib/json5/dune +++ b/lib/json5/dune @@ -1,6 +1,8 @@ -(ocamllex lexer) - (library (name yojson_json5) (public_name yojson_json5) - (preprocess (pps ppx_deriving.show))) + (libraries sedlex) + (preprocess + (pps ppx_deriving.show sedlex.ppx ppx_deriving.eq) + ) +) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml new file mode 100644 index 00000000..c4db5cf2 --- /dev/null +++ b/lib/json5/lexer.ml @@ -0,0 +1,77 @@ +open Types + +(* From https://www.ecma-international.org/ecma-262/5.1/#sec-7 *) + +(* +let digit = [%sedlex.regexp? '0'..'9'] +let number = [%sedlex.regexp? Plus digit] +*) + +(* NUMBERS, 7.8.3 *) +let non_zero_digit = [%sedlex.regexp? '1'..'9'] +let decimal_digit = [%sedlex.regexp? '0'..'9'] +let decimal_digits = [%sedlex.regexp? Plus decimal_digit] +let hex_digit = [%sedlex.regexp? '0'..'9'|'a'..'f'|'A'..'F'] +let exponent_indicator = [%sedlex.regexp? 'e'|'E'] +let signed_integer = [%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits] +let exponent_part = [%sedlex.regexp? exponent_indicator, signed_integer] +let decimal_integer_literal = [%sedlex.regexp? '0' | non_zero_digit, Opt decimal_digits] +let hex_integer_literal = [%sedlex.regexp? "0x", Plus hex_digit | "0X", Plus hex_digit] +(* float *) +let float_literal = [%sedlex.regexp? decimal_integer_literal, '.', Opt decimal_digits, Opt exponent_part | '.', decimal_digits, Opt exponent_part] +let json5_float = [%sedlex.regexp? float_literal | '+', float_literal | '-', float_literal] +(* int_or_float *) +let int_or_float_literal = [%sedlex.regexp? decimal_integer_literal, Opt exponent_part] +let json5_int_or_float = [%sedlex.regexp? int_or_float_literal | '+', int_or_float_literal | '-', int_or_float_literal] +(* int/hex *) +let json5_int = [%sedlex.regexp? hex_integer_literal | '+', hex_integer_literal | '-', hex_integer_literal] + +(* IDENTIFIER_NAME (keys in objects) *) +let unicode_escape_sequence = [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit] +let unicode_combining_mark =[%sedlex.regexp? mn | mc] +let unicode_digit = [%sedlex.regexp? nd] +let unicode_connector_punctuation = [%sedlex.regexp? pc] +let unicode_letter = [%sedlex.regexp? lu | ll | lt | lm | lo | nl] +let zwnj = [%sedlex.regexp? 0x200C] +let zwj = [%sedlex.regexp? 0x200D] +let identifier_start = [%sedlex.regexp? unicode_letter | '$' | '_' | '\\', unicode_escape_sequence] +let identifier_part = [%sedlex.regexp? identifier_start | unicode_combining_mark | unicode_digit | unicode_connector_punctuation | zwnj | zwj] +let identifier_name = [%sedlex.regexp? identifier_start, Star identifier_part] + +let lex_next buf = + let lexeme = Sedlexing.Utf8.lexeme in + match%sedlex buf with + | '{' -> OPEN_BRACE + | '}' -> CLOSE_BRACE + | '[' -> OPEN_BRACKET + | ']' -> CLOSE_BRACKET + | ':' -> COLON + | ',' -> COMMA + | ' ' -> SPACE + | "true" -> TRUE + | "false" -> FALSE + | "null" -> NULL + | json5_float -> + let s = float_of_string @@ lexeme buf in + FLOAT s + | json5_int_or_float -> + let s = lexeme buf in + INT_OR_FLOAT s + | json5_int -> + let s = int_of_string @@ lexeme buf in + INT s + | identifier_name -> + let s = lexeme buf in + IDENTIFIER_NAME s + | eof -> EOF + | _ -> + let s = lexeme buf in + failwith @@ "Unexpected character: '" ^ s ^ "'" + +let lex buf = + let rec loop xs buf = + match lex_next buf with + | EOF -> xs + | token -> loop (token::xs) buf + in + List.rev @@ loop [] buf diff --git a/lib/json5/lexer.mll b/lib/json5/lexer.mll deleted file mode 100644 index 7435638b..00000000 --- a/lib/json5/lexer.mll +++ /dev/null @@ -1,62 +0,0 @@ -{ - open Types -} - -(* From https://www.ecma-international.org/ecma-262/5.1/#sec-7 *) - -(* NUMBERS, 7.8.3 *) -let non_zero_digit = ['1'-'9'] -let decimal_digit = ['0'-'9'] -let decimal_digits = decimal_digit+ -let hex_digit = [ '0'-'9' 'a'-'f' 'A'-'F' ] -let exponent_indicator = ( 'e' | 'E' ) -let signed_integer = ( decimal_digits | '+' decimal_digits | '-' decimal_digits ) -let exponent_part = exponent_indicator signed_integer -let decimal_integer_literal = ( '0' | non_zero_digit decimal_digits? ) -let hex_integer_literal = ( "0x" hex_digit+ | "0X" hex_digit+ ) -(* float *) -let float_literal = ( decimal_integer_literal '.' decimal_digits? exponent_part? | '.' decimal_digits exponent_part? ) -let json5_float = ( float_literal | '+' float_literal | '-' float_literal ) -(* int_or_float *) -let int_or_float_literal = decimal_integer_literal exponent_part? -let json5_int_or_float = ( int_or_float_literal | '+' int_or_float_literal | '-' int_or_float_literal ) -(* int/hex *) -let json5_int = ( hex_integer_literal | '+' hex_integer_literal | '-' hex_integer_literal ) - -(* IDENTIFIER_NAME (keys in objects) *) -let unicode_escape_squence = 'u' hex_digit hex_digit hex_digit hex_digit -let unicode_letter = [ 'a'-'z' 'A'-'F' ] -let identifier_start = ( unicode_letter | '$' | '_' | '\\' unicode_escape_squence ) -let identifier_part = ( identifier_start | decimal_digits ) (* unicode_combining_mark, unicode_connector_punctuation, ZWNJ and NWJ missing *) -let identifier_name = identifier_start identifier_part+? - - -(* STRINGS, 7.8.4 *) - -rule read_token = parse - | "{" { OPEN_BRACE } - | "}" { CLOSE_BRACE } - | "[" { OPEN_BRACKET } - | "]" { CLOSE_BRACKET } - | ":" { COLON } - | "," { COMMA } - | "true" { TRUE } - | "false" { FALSE } - | "null" { NULL } - | " " { SPACE } - | json5_float { - let s = float_of_string @@ Lexing.lexeme lexbuf in - FLOAT s - } - | json5_int_or_float { - let s = Lexing.lexeme lexbuf in - INT_OR_FLOAT s - } - | json5_int { - let s = int_of_string @@ Lexing.lexeme lexbuf in - INT s - } - | identifier_name { - let s = Lexing.lexeme lexbuf in - IDENTIFIER_NAME s - } diff --git a/lib/json5/types.ml b/lib/json5/types.ml index b2742fe5..6ac1e303 100644 --- a/lib/json5/types.ml +++ b/lib/json5/types.ml @@ -13,5 +13,6 @@ type token = | INT_OR_FLOAT of string | INT of int | IDENTIFIER_NAME of string - [@@deriving show] + | EOF + [@@deriving show, eq] diff --git a/test/json5/dune b/test/json5/dune index d9b12ce2..681a96e8 100644 --- a/test/json5/dune +++ b/test/json5/dune @@ -1,4 +1,4 @@ (executable (name json5_test) (libraries yojson_json5) - (preprocess (pps ppx_deriving.show))) + (preprocess (pps ppx_deriving.show ppx_deriving.eq))) diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index eed0cbe2..76b306f2 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -1,25 +1,18 @@ module Lexer = Yojson_json5.Lexer open Yojson_json5.Types -type token_list = token list [@@deriving show] - +type token_list = token list [@@deriving show, eq] let () = let check_json (name, json_string, expected) = - let lex_buffer = Lexing.from_string json_string in + let buf = Sedlexing.Utf8.from_string json_string in + let result = Lexer.lex buf in - let rec loop lex_buf token_list = - match Lexer.read_token lex_buf with - | exception Failure _ -> token_list - | token -> loop lex_buf (token::token_list) - in - let result = loop lex_buffer [] - |> List.rev - |> show_token_list - in - let expected = show_token_list expected in - if result <> expected then - print_string @@ name ^ " failed:\n\nInput:\n" ^ result ^ "\n\nExpected:\n" ^ expected + match equal_token_list result expected with + | true -> () + | false -> + let s = Format.asprintf "%s failed:\n\nInput:\n%a\n\nExpected:\n%a\n" name pp_token_list result pp_token_list expected in + print_string s in let lexer_tests = [ diff --git a/yojson_json5.opam b/yojson_json5.opam index 64dd13f2..cff6b681 100644 --- a/yojson_json5.opam +++ b/yojson_json5.opam @@ -13,6 +13,7 @@ run-test: [["dune" "runtest" "-p" name "-j" jobs]] depends: [ "ocaml" {>= "4.02.3"} "dune" + "sedlex" "alcotest" {with-test & >= "0.8.5"} ] synopsis: From 7cb17ba089c9bef7bd03d431cc6ec88d84c78d67 Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Thu, 29 Oct 2020 16:39:02 +0100 Subject: [PATCH 05/24] add support for comments --- lib/json5/lexer.ml | 66 ++++++++++++++++++++++------------------ lib/json5/types.ml | 2 -- test/json5/json5_test.ml | 16 +++++++--- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index c4db5cf2..ddf65a77 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -38,40 +38,48 @@ let identifier_start = [%sedlex.regexp? unicode_letter | '$' | '_' | '\\', unico let identifier_part = [%sedlex.regexp? identifier_start | unicode_combining_mark | unicode_digit | unicode_connector_punctuation | zwnj | zwj] let identifier_name = [%sedlex.regexp? identifier_start, Star identifier_part] -let lex_next buf = +(* COMMENTS 7.4 *) +let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029] +let source_character = [%sedlex.regexp? any] +let single_line_comment_char = [%sedlex.regexp? Sub (source_character, line_terminator)] +let single_line_comment = [%sedlex.regexp? "//", Star single_line_comment_char] +let multi_line_not_forward_slash_or_asterisk_char = [%sedlex.regexp? Sub (source_character, (Chars "*/"))] +let multi_line_not_asterisk_char = [%sedlex.regexp? Sub (source_character, '*')] +let post_asterisk_comment_char = [%sedlex.regexp? Opt '*', multi_line_not_forward_slash_or_asterisk_char] +let multi_line_comment_char = [%sedlex.regexp? multi_line_not_asterisk_char | '*', Star post_asterisk_comment_char] +let multi_line_comment = [%sedlex.regexp? "/*", Star multi_line_comment_char, "*/"] +let comment = [%sedlex.regexp? multi_line_comment | single_line_comment] + +let white_space = [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs] + +let rec lex tokens buf = let lexeme = Sedlexing.Utf8.lexeme in match%sedlex buf with - | '{' -> OPEN_BRACE - | '}' -> CLOSE_BRACE - | '[' -> OPEN_BRACKET - | ']' -> CLOSE_BRACKET - | ':' -> COLON - | ',' -> COMMA - | ' ' -> SPACE - | "true" -> TRUE - | "false" -> FALSE - | "null" -> NULL + | '{' -> lex (OPEN_BRACE::tokens) buf + | '}' -> lex (CLOSE_BRACE::tokens) buf + | '[' -> lex (OPEN_BRACKET::tokens) buf + | ']' -> lex (CLOSE_BRACKET::tokens) buf + | ':' -> lex (COLON::tokens) buf + | ',' -> lex (COMMA::tokens) buf + | comment + | white_space + | line_terminator -> lex tokens buf + | "true" -> lex (TRUE::tokens) buf + | "false" -> lex (FALSE::tokens) buf + | "null" -> lex (NULL::tokens) buf | json5_float -> let s = float_of_string @@ lexeme buf in - FLOAT s + lex (FLOAT s::tokens) buf | json5_int_or_float -> - let s = lexeme buf in - INT_OR_FLOAT s + let s = lexeme buf in + lex (INT_OR_FLOAT s::tokens) buf | json5_int -> - let s = int_of_string @@ lexeme buf in - INT s + let s = int_of_string @@ lexeme buf in + lex (INT s::tokens) buf | identifier_name -> - let s = lexeme buf in - IDENTIFIER_NAME s - | eof -> EOF + let s = lexeme buf in + lex (IDENTIFIER_NAME s::tokens) buf + | eof -> List.rev tokens | _ -> - let s = lexeme buf in - failwith @@ "Unexpected character: '" ^ s ^ "'" - -let lex buf = - let rec loop xs buf = - match lex_next buf with - | EOF -> xs - | token -> loop (token::xs) buf - in - List.rev @@ loop [] buf + let s = lexeme buf in + failwith @@ "Unexpected character: '" ^ s ^ "'" diff --git a/lib/json5/types.ml b/lib/json5/types.ml index 6ac1e303..2cb74762 100644 --- a/lib/json5/types.ml +++ b/lib/json5/types.ml @@ -8,11 +8,9 @@ type token = | TRUE | FALSE | NULL - | SPACE | FLOAT of float | INT_OR_FLOAT of string | INT of int | IDENTIFIER_NAME of string - | EOF [@@deriving show, eq] diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index 76b306f2..8fdd2f0e 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -6,23 +6,31 @@ type token_list = token list [@@deriving show, eq] let () = let check_json (name, json_string, expected) = let buf = Sedlexing.Utf8.from_string json_string in - let result = Lexer.lex buf in + let result = Lexer.lex [] buf in match equal_token_list result expected with | true -> () | false -> - let s = Format.asprintf "%s failed:\n\nInput:\n%a\n\nExpected:\n%a\n" name pp_token_list result pp_token_list expected in + let s = Format.asprintf "%s failed:\n\nOutput:\n%a\n\nExpected:\n%a\n" name pp_token_list result pp_token_list expected in print_string s in let lexer_tests = [ ("Float, no leading number", ".52", [FLOAT 0.52]); ("Float, simple", "23.52", [FLOAT 23.52]); - ("Float with e & E", "2.1e2 2.1E2", [FLOAT 210.; SPACE; FLOAT 210.]); + ("Float with e & E", "2.1e2 2.1E2", [FLOAT 210.; FLOAT 210.]); ("Int of float", "42", [INT_OR_FLOAT "42"]); ("Hex/Int", "0x10", [INT 16]); - ("identifer name in an object", "{hj: 42}", [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; SPACE; INT_OR_FLOAT "42"; CLOSE_BRACE]); + ("identifer name in an object", "{hj: 42}", [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; INT_OR_FLOAT "42"; CLOSE_BRACE]); + ("multi line comment", "{hj: 42 /* hello\nsatan */}", [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; INT_OR_FLOAT "42"; CLOSE_BRACE]); + ("single line comment", "{//foo\na: 1}", [OPEN_BRACE; IDENTIFIER_NAME "a"; COLON; INT_OR_FLOAT "1"; CLOSE_BRACE]); + ("comment from hell 1", "/**/1", [INT_OR_FLOAT "1"]); + ("comment from hell 2", "/*/*/1", [INT_OR_FLOAT "1"]); + ("comment from hell 3", "/***/1", [INT_OR_FLOAT "1"]); + ("comment from hell 4", "/*//hell\n*/1", [INT_OR_FLOAT "1"]); + (* ("comment from hell 5", "/* */[\"aa\",/* \"don't\" */ \"show */\"]", [IDENTIFIER_NAME "aa"]); *) ] in let _ = List.map check_json lexer_tests in () + From b1dc1bf9d0ea15fa4de770d76c8cbca73fca843b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gert=20S=C3=B8nderby?= Date: Fri, 30 Oct 2020 13:38:45 +0100 Subject: [PATCH 06/24] Add ppx_deriving dependency --- yojson_json5.opam | 1 + 1 file changed, 1 insertion(+) diff --git a/yojson_json5.opam b/yojson_json5.opam index cff6b681..712d4c4d 100644 --- a/yojson_json5.opam +++ b/yojson_json5.opam @@ -15,6 +15,7 @@ depends: [ "dune" "sedlex" "alcotest" {with-test & >= "0.8.5"} + "ppx_deriving" ] synopsis: "Yojson is an optimized parsing and printing library for the JSON format" From 3b56fd684f62dbb5d9d37264aab3a6cd59bacca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gert=20S=C3=B8nderby?= Date: Fri, 30 Oct 2020 14:45:56 +0100 Subject: [PATCH 07/24] Test file for JSON5 lexing --- lib/json5/lexer.ml | 7 ++-- test/json5/dune | 5 ++- test/json5/json5_test.ml | 81 +++++++++++++++++++++++++--------------- 3 files changed, 56 insertions(+), 37 deletions(-) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index ddf65a77..6d1df9a7 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -43,11 +43,10 @@ let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029] let source_character = [%sedlex.regexp? any] let single_line_comment_char = [%sedlex.regexp? Sub (source_character, line_terminator)] let single_line_comment = [%sedlex.regexp? "//", Star single_line_comment_char] -let multi_line_not_forward_slash_or_asterisk_char = [%sedlex.regexp? Sub (source_character, (Chars "*/"))] let multi_line_not_asterisk_char = [%sedlex.regexp? Sub (source_character, '*')] -let post_asterisk_comment_char = [%sedlex.regexp? Opt '*', multi_line_not_forward_slash_or_asterisk_char] -let multi_line_comment_char = [%sedlex.regexp? multi_line_not_asterisk_char | '*', Star post_asterisk_comment_char] -let multi_line_comment = [%sedlex.regexp? "/*", Star multi_line_comment_char, "*/"] +let multi_line_not_slash_char = [%sedlex.regexp? Sub (source_character, '/')] +let multi_line_comment_char = [%sedlex.regexp? multi_line_not_asterisk_char | '*', Plus multi_line_not_slash_char] +let multi_line_comment = [%sedlex.regexp? "/*", Star multi_line_comment_char, Opt '*', "*/"] let comment = [%sedlex.regexp? multi_line_comment | single_line_comment] let white_space = [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs] diff --git a/test/json5/dune b/test/json5/dune index 681a96e8..85583f9a 100644 --- a/test/json5/dune +++ b/test/json5/dune @@ -1,4 +1,5 @@ (executable (name json5_test) - (libraries yojson_json5) - (preprocess (pps ppx_deriving.show ppx_deriving.eq))) + (libraries yojson_json5 alcotest) + (preprocess + (pps ppx_deriving.show ppx_deriving.eq))) diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index 8fdd2f0e..8678922f 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -1,36 +1,55 @@ module Lexer = Yojson_json5.Lexer open Yojson_json5.Types -type token_list = token list [@@deriving show, eq] +let tokenize_json5 (json_string) = + let buf = Sedlexing.Utf8.from_string json_string in + Lexer.lex [] buf -let () = - let check_json (name, json_string, expected) = - let buf = Sedlexing.Utf8.from_string json_string in - let result = Lexer.lex [] buf in - - match equal_token_list result expected with - | true -> () - | false -> - let s = Format.asprintf "%s failed:\n\nOutput:\n%a\n\nExpected:\n%a\n" name pp_token_list result pp_token_list expected in - print_string s - in - - let lexer_tests = [ - ("Float, no leading number", ".52", [FLOAT 0.52]); - ("Float, simple", "23.52", [FLOAT 23.52]); - ("Float with e & E", "2.1e2 2.1E2", [FLOAT 210.; FLOAT 210.]); - ("Int of float", "42", [INT_OR_FLOAT "42"]); - ("Hex/Int", "0x10", [INT 16]); - ("identifer name in an object", "{hj: 42}", [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; INT_OR_FLOAT "42"; CLOSE_BRACE]); - ("multi line comment", "{hj: 42 /* hello\nsatan */}", [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; INT_OR_FLOAT "42"; CLOSE_BRACE]); - ("single line comment", "{//foo\na: 1}", [OPEN_BRACE; IDENTIFIER_NAME "a"; COLON; INT_OR_FLOAT "1"; CLOSE_BRACE]); - ("comment from hell 1", "/**/1", [INT_OR_FLOAT "1"]); - ("comment from hell 2", "/*/*/1", [INT_OR_FLOAT "1"]); - ("comment from hell 3", "/***/1", [INT_OR_FLOAT "1"]); - ("comment from hell 4", "/*//hell\n*/1", [INT_OR_FLOAT "1"]); - (* ("comment from hell 5", "/* */[\"aa\",/* \"don't\" */ \"show */\"]", [IDENTIFIER_NAME "aa"]); *) - ] in - - let _ = List.map check_json lexer_tests in - () +let token = Alcotest.testable pp_token equal_token + +let test_float () = + Alcotest.(check (list token)) "Simple" [FLOAT 23.52] (tokenize_json5 "23.52"); + Alcotest.(check (list token)) "No leading number" [FLOAT 0.52] (tokenize_json5 ".52"); + Alcotest.(check (list token)) "With exponent" [FLOAT 210.; FLOAT 210.] (tokenize_json5 "2.1e2 2.1E2") + +let test_int_or_float () = + Alcotest.(check (list token)) "Int or float" [INT_OR_FLOAT "42"] (tokenize_json5 "42") + +let test_int () = + Alcotest.(check (list token)) "Hex/Int" [INT 16] (tokenize_json5 "0x10") +let test_identifier () = + Alcotest.(check (list token)) + "Identifer name in an object" + [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; INT_OR_FLOAT "42"; CLOSE_BRACE] + (tokenize_json5 "{hj: 42}") + +let test_multi_line_comments () = + Alcotest.(check (list token)) "Simple case" [] (tokenize_json5 "/* hello\nworld */"); + Alcotest.(check (list token)) "Between numbers" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/* hello\nworld */1"); + Alcotest.(check (list token)) "Empty" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/**/1"); + Alcotest.(check (list token)) "Contains slash" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/*/*/1"); + Alcotest.(check (list token)) "Contains asterisk" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/***/1"); + Alcotest.(check (list token)) "Contains double asterisk" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/****/1"); + Alcotest.check_raises "Contains comment end" (Failure "Unexpected character: ''") (fun () -> ignore @@ tokenize_json5 "/* */ */") + +let test_single_line_comments () = + Alcotest.(check (list token)) "Simple case" [] (tokenize_json5 "//foo\n"); + Alcotest.(check (list token)) "Between numbers" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1//foo\n1") + +let () = + let open Alcotest in + run "JSON5" [ + "Numbers", [ + test_case "Float" `Quick test_float; + test_case "Int or float" `Quick test_int_or_float; + test_case "Int" `Quick test_int; + ]; + "Objects", [ + test_case "Identifiers" `Quick test_identifier; + ]; + "Comments", [ + test_case "Multi-line comments" `Quick test_multi_line_comments; + test_case "Single-line comments" `Quick test_single_line_comments; + ]; + ] From 03eefbd6b7f1c6595acb523b2cccb57dac1126ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gert=20S=C3=B8nderby?= Date: Fri, 30 Oct 2020 15:58:02 +0100 Subject: [PATCH 08/24] Basic strings, escape seqs --- lib/json5/lexer.ml | 22 +++++++++++++++++++--- lib/json5/types.ml | 1 + test/json5/json5_test.ml | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index 6d1df9a7..4b6d5b03 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -6,6 +6,9 @@ open Types let digit = [%sedlex.regexp? '0'..'9'] let number = [%sedlex.regexp? Plus digit] *) +let source_character = [%sedlex.regexp? any] +let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029] +let line_terminator_sequence = [%sedlex.regexp? 0x000A | 0x000D, Opt 0x000A | 0x2028 | 0x2029] (* NUMBERS, 7.8.3 *) let non_zero_digit = [%sedlex.regexp? '1'..'9'] @@ -26,8 +29,21 @@ let json5_int_or_float = [%sedlex.regexp? int_or_float_literal | '+', int_or_flo (* int/hex *) let json5_int = [%sedlex.regexp? hex_integer_literal | '+', hex_integer_literal | '-', hex_integer_literal] -(* IDENTIFIER_NAME (keys in objects) *) +(* STRING LITERALS, 7.8.4 *) let unicode_escape_sequence = [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit] +let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}] +let escape_character = [%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u' ] +let non_escape_character = [%sedlex.regexp? Sub (source_character, ( escape_character | line_terminator ) ) ] +let character_escape_sequence = [%sedlex.regexp? single_escape_character | non_escape_character ] +let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence ] +let escape_sequence = [%sedlex.regexp? unicode_escape_sequence | character_escape_sequence ] +let single_string_character = [%sedlex.regexp? Sub (source_character, ('\'' | '\\' | line_terminator)) | '\\', escape_sequence | line_continuation ] +let double_string_character = [%sedlex.regexp? Sub (source_character, ('"' | '\\' | line_terminator)) | '\\', escape_sequence | line_continuation ] +let string_literal = [%sedlex.regexp? '"', Star double_string_character, '"' | '\'', Star single_string_character, '\'' ] + + + +(* IDENTIFIER_NAME (keys in objects) *) let unicode_combining_mark =[%sedlex.regexp? mn | mc] let unicode_digit = [%sedlex.regexp? nd] let unicode_connector_punctuation = [%sedlex.regexp? pc] @@ -39,8 +55,6 @@ let identifier_part = [%sedlex.regexp? identifier_start | unicode_combining_mark let identifier_name = [%sedlex.regexp? identifier_start, Star identifier_part] (* COMMENTS 7.4 *) -let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029] -let source_character = [%sedlex.regexp? any] let single_line_comment_char = [%sedlex.regexp? Sub (source_character, line_terminator)] let single_line_comment = [%sedlex.regexp? "//", Star single_line_comment_char] let multi_line_not_asterisk_char = [%sedlex.regexp? Sub (source_character, '*')] @@ -66,6 +80,8 @@ let rec lex tokens buf = | "true" -> lex (TRUE::tokens) buf | "false" -> lex (FALSE::tokens) buf | "null" -> lex (NULL::tokens) buf + | string_literal -> let s = lexeme buf in + lex (STRING s::tokens) buf | json5_float -> let s = float_of_string @@ lexeme buf in lex (FLOAT s::tokens) buf diff --git a/lib/json5/types.ml b/lib/json5/types.ml index 2cb74762..3d46352a 100644 --- a/lib/json5/types.ml +++ b/lib/json5/types.ml @@ -11,6 +11,7 @@ type token = | FLOAT of float | INT_OR_FLOAT of string | INT of int + | STRING of string | IDENTIFIER_NAME of string [@@deriving show, eq] diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index 8678922f..dccdd3e7 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -18,6 +18,21 @@ let test_int_or_float () = let test_int () = Alcotest.(check (list token)) "Hex/Int" [INT 16] (tokenize_json5 "0x10") +let test_string () = + Alcotest.(check (list token)) "Doublequoted simple" [STRING "\"hello\""] (tokenize_json5 "\"hello\""); + Alcotest.(check (list token)) "Doublequoted single-character escape sequence" [STRING {|"\'\"\\\b\f\n\r\t\v"|}] (tokenize_json5 {|"\'\"\\\b\f\n\r\t\v"|}); + Alcotest.(check (list token)) "Doublequoted non-escape-character escape sequence" [STRING {|"\z"|}] (tokenize_json5 {|"\z"|}); + Alcotest.(check (list token)) "Doublequoted zero escape sequence" [STRING {|"\0"|}] (tokenize_json5 {|"\0"|}); + Alcotest.(check (list token)) "Doublequoted unicode escape" [STRING "\"\\uD83D\\uDC2A\""] (tokenize_json5 "\"\\uD83D\\uDC2A\""); + Alcotest.(check (list token)) "Doublequoted line continuation" [STRING "\"hel\\\nlo\""] (tokenize_json5 "\"hel\\\nlo\""); + Alcotest.(check (list token)) "Singlequoted simple" [STRING "'hello'"] (tokenize_json5 "'hello'"); + Alcotest.(check (list token)) "Singlequoted single-character escape sequence" [STRING {|'\'\"\\\b\f\n\r\t\v'|}] (tokenize_json5 {|'\'\"\\\b\f\n\r\t\v'|}); + Alcotest.(check (list token)) "Singlequoted non-escape-character escape sequence" [STRING {|'\z'|}] (tokenize_json5 {|'\z'|}); + Alcotest.(check (list token)) "Singlequoted zero escape sequence" [STRING {|'\0'|}] (tokenize_json5 {|'\0'|}); + Alcotest.(check (list token)) "Singlequoted unicode escape" [STRING "'\\uD83D\\uDC2A'"] (tokenize_json5 "'\\uD83D\\uDC2A'"); + Alcotest.(check (list token)) "Singlequoted line continuation" [STRING "'hel\\\nlo'"] (tokenize_json5 "'hel\\\nlo'"); + () + let test_identifier () = Alcotest.(check (list token)) "Identifer name in an object" @@ -45,6 +60,9 @@ let () = test_case "Int or float" `Quick test_int_or_float; test_case "Int" `Quick test_int; ]; + "Strings", [ + test_case "String" `Quick test_string; + ]; "Objects", [ test_case "Identifiers" `Quick test_identifier; ]; From a5931f9d303e218f5c31a90f49a5048963c9b402 Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Thu, 5 Nov 2020 14:38:24 +0100 Subject: [PATCH 09/24] skeleton of parser --- lib/json5/dune | 2 +- lib/json5/lexer.ml | 2 +- lib/json5/parser.ml | 49 ++++++++++++++++++++++++++++++++++++++++ lib/json5/types.ml | 1 + test/json5/json5_test.ml | 4 +++- 5 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 lib/json5/parser.ml diff --git a/lib/json5/dune b/lib/json5/dune index 740650ff..63b082c9 100644 --- a/lib/json5/dune +++ b/lib/json5/dune @@ -1,7 +1,7 @@ (library (name yojson_json5) (public_name yojson_json5) - (libraries sedlex) + (libraries yojson sedlex) (preprocess (pps ppx_deriving.show sedlex.ppx ppx_deriving.eq) ) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index 4b6d5b03..859fb1d4 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -36,7 +36,7 @@ let escape_character = [%sedlex.regexp? single_escape_character | decimal_digit let non_escape_character = [%sedlex.regexp? Sub (source_character, ( escape_character | line_terminator ) ) ] let character_escape_sequence = [%sedlex.regexp? single_escape_character | non_escape_character ] let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence ] -let escape_sequence = [%sedlex.regexp? unicode_escape_sequence | character_escape_sequence ] +let escape_sequence = [%sedlex.regexp? character_escape_sequence | '0' | unicode_escape_sequence ] (* TODO *) let single_string_character = [%sedlex.regexp? Sub (source_character, ('\'' | '\\' | line_terminator)) | '\\', escape_sequence | line_continuation ] let double_string_character = [%sedlex.regexp? Sub (source_character, ('"' | '\\' | line_terminator)) | '\\', escape_sequence | line_continuation ] let string_literal = [%sedlex.regexp? '"', Star double_string_character, '"' | '\'', Star single_string_character, '\'' ] diff --git a/lib/json5/parser.ml b/lib/json5/parser.ml new file mode 100644 index 00000000..7178748e --- /dev/null +++ b/lib/json5/parser.ml @@ -0,0 +1,49 @@ +open Types + +let escape_string x = x + +let rec parse_list acc = function + | [] -> failwith "List never ends" + | CLOSE_BRACKET::_ -> acc + | x::COMMA::xs -> + let acc = (parse [x])::acc in + parse_list acc xs + | x::CLOSE_BRACKET::_ -> + (parse [x])::acc + | x::_ -> + let s = Format.asprintf "Unexpected list token: %a" pp_token x in + failwith s + +and parse_assoc acc = function + | [] -> failwith "Assoc never ends" + | CLOSE_BRACE::_ -> acc + | (STRING k)::COLON::v::COMMA::xs + | (IDENTIFIER_NAME k)::COLON::v::COMMA::xs -> + let item = (k, parse [v]) in + parse_assoc (item::acc) xs + | (STRING k)::COLON::v::CLOSE_BRACE::_xs + | (IDENTIFIER_NAME k)::COLON::v::CLOSE_BRACE::_xs -> + (k, parse [v])::acc + | x::_ -> + let s = Format.asprintf "Unexpected assoc list token: %a" pp_token x in + failwith s + +and parse : token list -> t = function + | [] -> failwith "empty list of tokens" + | token::xs -> + match token with + | TRUE -> `Bool true + | FALSE -> `Bool false + | NULL -> `Null + | INT v -> `Int v + | FLOAT v -> `Float v + | INT_OR_FLOAT v -> `String v + | STRING s -> `String (escape_string s) + | OPEN_BRACKET -> `List (parse_list [] xs) + | OPEN_BRACE -> `Assoc (parse_assoc [] xs) + | x -> + let s = Format.asprintf "Unexpected token: %a" pp_token x in + failwith s + + + diff --git a/lib/json5/types.ml b/lib/json5/types.ml index 3d46352a..ac98404d 100644 --- a/lib/json5/types.ml +++ b/lib/json5/types.ml @@ -15,3 +15,4 @@ type token = | IDENTIFIER_NAME of string [@@deriving show, eq] +type t = Yojson.Safe.t diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index dccdd3e7..e633555c 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -21,8 +21,9 @@ let test_int () = let test_string () = Alcotest.(check (list token)) "Doublequoted simple" [STRING "\"hello\""] (tokenize_json5 "\"hello\""); Alcotest.(check (list token)) "Doublequoted single-character escape sequence" [STRING {|"\'\"\\\b\f\n\r\t\v"|}] (tokenize_json5 {|"\'\"\\\b\f\n\r\t\v"|}); - Alcotest.(check (list token)) "Doublequoted non-escape-character escape sequence" [STRING {|"\z"|}] (tokenize_json5 {|"\z"|}); + Alcotest.(check (list token)) "Doublequoted non-escape-character escape sequence" [STRING {|"foo\z"|}] (tokenize_json5 {|"foo\z"|}); Alcotest.(check (list token)) "Doublequoted zero escape sequence" [STRING {|"\0"|}] (tokenize_json5 {|"\0"|}); + (* Alcotest.check_raises "Doublequoted zero then one escape sequence" (Failure "Unexpected character: ''") (fun () -> ignore @@ tokenize_json5 {|"\01"|}); *) Alcotest.(check (list token)) "Doublequoted unicode escape" [STRING "\"\\uD83D\\uDC2A\""] (tokenize_json5 "\"\\uD83D\\uDC2A\""); Alcotest.(check (list token)) "Doublequoted line continuation" [STRING "\"hel\\\nlo\""] (tokenize_json5 "\"hel\\\nlo\""); Alcotest.(check (list token)) "Singlequoted simple" [STRING "'hello'"] (tokenize_json5 "'hello'"); @@ -31,6 +32,7 @@ let test_string () = Alcotest.(check (list token)) "Singlequoted zero escape sequence" [STRING {|'\0'|}] (tokenize_json5 {|'\0'|}); Alcotest.(check (list token)) "Singlequoted unicode escape" [STRING "'\\uD83D\\uDC2A'"] (tokenize_json5 "'\\uD83D\\uDC2A'"); Alcotest.(check (list token)) "Singlequoted line continuation" [STRING "'hel\\\nlo'"] (tokenize_json5 "'hel\\\nlo'"); + (* Alcotest.(check (list token)) "Singlequoted one escape sequence" [STRING {|'\1'|}] (tokenize_json5 {|'\1'|}); *) () let test_identifier () = From 5a12eb278c1d3da6e38f2c84a61fe8f863b1bf37 Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Thu, 5 Nov 2020 15:52:23 +0100 Subject: [PATCH 10/24] implement parsing of lists --- lib/json5/parser.ml | 63 +++++++++++++++++++++++++--------------- test/json5/json5_test.ml | 31 ++++++++++++++++++++ 2 files changed, 70 insertions(+), 24 deletions(-) diff --git a/lib/json5/parser.ml b/lib/json5/parser.ml index 7178748e..7ba2f535 100644 --- a/lib/json5/parser.ml +++ b/lib/json5/parser.ml @@ -4,43 +4,58 @@ let escape_string x = x let rec parse_list acc = function | [] -> failwith "List never ends" - | CLOSE_BRACKET::_ -> acc - | x::COMMA::xs -> - let acc = (parse [x])::acc in - parse_list acc xs - | x::CLOSE_BRACKET::_ -> - (parse [x])::acc - | x::_ -> - let s = Format.asprintf "Unexpected list token: %a" pp_token x in - failwith s + | CLOSE_BRACKET::xs + | COMMA::CLOSE_BRACKET::xs -> (acc, xs) + | xs -> ( + let (v, xs) = parse xs in + match xs with + | [] -> failwith "List was not closed" + | CLOSE_BRACKET::xs + | COMMA::CLOSE_BRACKET::xs -> (v::acc, xs) + | COMMA::xs -> parse_list (v::acc) xs + | x::_ -> + let s = Format.asprintf "Unexpected list token: %a" pp_token x in + failwith s) +(* and parse_assoc acc = function | [] -> failwith "Assoc never ends" - | CLOSE_BRACE::_ -> acc + | CLOSE_BRACE::xs + | COMMA::CLOSE_BRACE::xs -> (acc, xs) | (STRING k)::COLON::v::COMMA::xs | (IDENTIFIER_NAME k)::COLON::v::COMMA::xs -> - let item = (k, parse [v]) in + let (v, xs) = parse [v] in + let item = (k, v) in parse_assoc (item::acc) xs - | (STRING k)::COLON::v::CLOSE_BRACE::_xs - | (IDENTIFIER_NAME k)::COLON::v::CLOSE_BRACE::_xs -> - (k, parse [v])::acc + | (STRING k)::COLON::v::CLOSE_BRACE::xs + | (IDENTIFIER_NAME k)::COLON::v::CLOSE_BRACE::xs -> + let (v, _) = parse [x] + let acc = (k, parse [v])::acc in + (acc, xs) | x::_ -> let s = Format.asprintf "Unexpected assoc list token: %a" pp_token x in failwith s +*) -and parse : token list -> t = function +and parse : token list -> (t * token list) = function | [] -> failwith "empty list of tokens" | token::xs -> match token with - | TRUE -> `Bool true - | FALSE -> `Bool false - | NULL -> `Null - | INT v -> `Int v - | FLOAT v -> `Float v - | INT_OR_FLOAT v -> `String v - | STRING s -> `String (escape_string s) - | OPEN_BRACKET -> `List (parse_list [] xs) - | OPEN_BRACE -> `Assoc (parse_assoc [] xs) + | TRUE -> (`Bool true, xs) + | FALSE -> (`Bool false, xs) + | NULL -> (`Null, xs) + | INT v -> (`Int v, xs) + | FLOAT v -> (`Float v, xs) + | INT_OR_FLOAT v -> (`String v, xs) + | STRING s -> (`String (escape_string s), xs) + | OPEN_BRACKET -> + let (l, xs) = parse_list [] xs in + (`List (List.rev l), xs) + (* + | OPEN_BRACE -> + let (a, xs) = parse_assoc [] xs in + (`Assoc (List.rev a), xs) + *) | x -> let s = Format.asprintf "Unexpected token: %a" pp_token x in failwith s diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index e633555c..b87fac50 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -54,6 +54,33 @@ let test_single_line_comments () = Alcotest.(check (list token)) "Simple case" [] (tokenize_json5 "//foo\n"); Alcotest.(check (list token)) "Between numbers" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1//foo\n1") +(** + * PARSING + *) + +let yojson = Alcotest.testable Yojson.Safe.pp Yojson.Safe.equal + +let parse tokens = + let (json, _) = Yojson_json5.Parser.parse tokens in + json + +let test_parser_simple () = + Alcotest.(check yojson) "Simple null" `Null (parse [NULL]); + Alcotest.(check yojson) "Simple true" (`Bool true) (parse [TRUE]); + Alcotest.(check yojson) "Simple false" (`Bool false) (parse [FALSE]); + () + +let test_parser_list () = + Alcotest.(check yojson) "Empty list" (`List []) (parse [OPEN_BRACKET; CLOSE_BRACKET]); + Alcotest.(check yojson) "List with bools" (`List [`Bool false; `Bool true]) (parse [OPEN_BRACKET; FALSE; COMMA; TRUE; CLOSE_BRACKET]); + Alcotest.(check yojson) "List of lists" (`List [ `List []; `Null ]) (parse [OPEN_BRACKET; OPEN_BRACKET; CLOSE_BRACKET; COMMA; NULL; CLOSE_BRACKET]); + () + + +(** + * RUN + *) + let () = let open Alcotest in run "JSON5" [ @@ -72,4 +99,8 @@ let () = test_case "Multi-line comments" `Quick test_multi_line_comments; test_case "Single-line comments" `Quick test_single_line_comments; ]; + "Parse", [ + test_case "Simple parsing" `Quick test_parser_simple; + test_case "Simple list parsing" `Quick test_parser_list; + ]; ] From 08fd84771be4a70f596644a58294f56eb57a4b5c Mon Sep 17 00:00:00 2001 From: Gorm Casper Date: Fri, 6 Nov 2020 10:58:49 +0100 Subject: [PATCH 11/24] implement parsing of objects --- lib/json5/parser.ml | 24 +++++++++++------------- test/json5/json5_test.ml | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/lib/json5/parser.ml b/lib/json5/parser.ml index 7ba2f535..f3293bac 100644 --- a/lib/json5/parser.ml +++ b/lib/json5/parser.ml @@ -17,25 +17,25 @@ let rec parse_list acc = function let s = Format.asprintf "Unexpected list token: %a" pp_token x in failwith s) -(* and parse_assoc acc = function | [] -> failwith "Assoc never ends" | CLOSE_BRACE::xs | COMMA::CLOSE_BRACE::xs -> (acc, xs) - | (STRING k)::COLON::v::COMMA::xs - | (IDENTIFIER_NAME k)::COLON::v::COMMA::xs -> - let (v, xs) = parse [v] in + | (STRING k)::COLON::xs + | (IDENTIFIER_NAME k)::COLON::xs -> ( + let (v, xs) = parse xs in let item = (k, v) in - parse_assoc (item::acc) xs - | (STRING k)::COLON::v::CLOSE_BRACE::xs - | (IDENTIFIER_NAME k)::COLON::v::CLOSE_BRACE::xs -> - let (v, _) = parse [x] - let acc = (k, parse [v])::acc in - (acc, xs) + match xs with + | [] -> failwith "Object was not closed" + | CLOSE_BRACE::xs + | COMMA::CLOSE_BRACE::xs -> (item::acc, xs) + | COMMA::xs -> parse_assoc (item::acc) xs + | x::_ -> + let s = Format.asprintf "Unexpected assoc list token: %a" pp_token x in + failwith s) | x::_ -> let s = Format.asprintf "Unexpected assoc list token: %a" pp_token x in failwith s -*) and parse : token list -> (t * token list) = function | [] -> failwith "empty list of tokens" @@ -51,11 +51,9 @@ and parse : token list -> (t * token list) = function | OPEN_BRACKET -> let (l, xs) = parse_list [] xs in (`List (List.rev l), xs) - (* | OPEN_BRACE -> let (a, xs) = parse_assoc [] xs in (`Assoc (List.rev a), xs) - *) | x -> let s = Format.asprintf "Unexpected token: %a" pp_token x in failwith s diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml index b87fac50..be6b6614 100644 --- a/test/json5/json5_test.ml +++ b/test/json5/json5_test.ml @@ -54,6 +54,7 @@ let test_single_line_comments () = Alcotest.(check (list token)) "Simple case" [] (tokenize_json5 "//foo\n"); Alcotest.(check (list token)) "Between numbers" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1//foo\n1") + (** * PARSING *) @@ -68,12 +69,30 @@ let test_parser_simple () = Alcotest.(check yojson) "Simple null" `Null (parse [NULL]); Alcotest.(check yojson) "Simple true" (`Bool true) (parse [TRUE]); Alcotest.(check yojson) "Simple false" (`Bool false) (parse [FALSE]); + Alcotest.(check yojson) "Simple int" (`Int 3) (parse [INT 3]); + Alcotest.(check yojson) "Simple float" (`Float 3.4) (parse [FLOAT 3.4]); + () + +let test_parser_string () = + Alcotest.(check yojson) "Simple string" (`String "hello") (parse [STRING "hello"]); + Alcotest.(check yojson) "Escape sequences" (`String "a\'\"\\\b\n\r\ta") (parse [STRING "a\'\"\\\b\n\r\ta"]); () let test_parser_list () = Alcotest.(check yojson) "Empty list" (`List []) (parse [OPEN_BRACKET; CLOSE_BRACKET]); Alcotest.(check yojson) "List with bools" (`List [`Bool false; `Bool true]) (parse [OPEN_BRACKET; FALSE; COMMA; TRUE; CLOSE_BRACKET]); Alcotest.(check yojson) "List of lists" (`List [ `List []; `Null ]) (parse [OPEN_BRACKET; OPEN_BRACKET; CLOSE_BRACKET; COMMA; NULL; CLOSE_BRACKET]); + Alcotest.(check yojson) "List with trailing comma" (`List [ `Null ]) (parse [OPEN_BRACKET; NULL; COMMA; CLOSE_BRACKET]); + Alcotest.(check yojson) "List of list with content" (`List [ `List [ `Bool true ] ]) (parse [OPEN_BRACKET; OPEN_BRACKET; TRUE; CLOSE_BRACKET; CLOSE_BRACKET]); + () + +let test_parser_object () = + Alcotest.(check yojson) "Empty object" (`Assoc []) (parse [OPEN_BRACE; CLOSE_BRACE]); + Alcotest.(check yojson) "String key" (`Assoc [ ("foo", `Null) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; NULL; CLOSE_BRACE]); + Alcotest.(check yojson) "Identifer key" (`Assoc [ ("foo", `Null) ]) (parse [OPEN_BRACE; IDENTIFIER_NAME "foo"; COLON; NULL; CLOSE_BRACE]); + Alcotest.(check yojson) "Trailing comma" (`Assoc [ ("foo", `Null) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; NULL; COMMA; CLOSE_BRACE]); + Alcotest.(check yojson) "Nested object" (`Assoc [ ("foo", `Assoc [ ("bar", `Null) ]) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; OPEN_BRACE; STRING "bar"; COLON; NULL; CLOSE_BRACE; COMMA; CLOSE_BRACE]); + Alcotest.(check yojson) "Mixed keys and values" (`Assoc [ ("foo", `Bool true); ("bar", `Null); ("baz", `Bool false) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; TRUE; COMMA; IDENTIFIER_NAME "bar"; COLON; NULL; COMMA; IDENTIFIER_NAME "baz"; COLON; FALSE; CLOSE_BRACE]); () @@ -101,6 +120,8 @@ let () = ]; "Parse", [ test_case "Simple parsing" `Quick test_parser_simple; + test_case "Strings" `Quick test_parser_string; test_case "Simple list parsing" `Quick test_parser_list; + test_case "Simple object parsing" `Quick test_parser_object; ]; ] From 23d62aec3626b1c82fad870a55ae9268fd8e778f Mon Sep 17 00:00:00 2001 From: Daniel Hilst Date: Sun, 25 Sep 2022 09:42:04 -0300 Subject: [PATCH 12/24] Add JSON5 code --- dune-project | 11 ++ lib/json5/ast.ml | 37 ++++++ lib/json5/basic.ml | 7 ++ lib/json5/dune | 6 +- lib/json5/let_syntax.ml | 3 + lib/json5/lexer.ml | 224 +++++++++++++++++++++++++++---------- lib/json5/parser.ml | 132 +++++++++++++--------- lib/json5/read.ml | 34 ++++++ lib/json5/safe.ml | 7 ++ lib/json5/types.ml | 18 --- lib/json5/yojson_json5.ml | 2 + lib/json5/yojson_json5.mli | 71 ++++++++++++ test/json5/dune | 5 - test/json5/json5_test.ml | 127 --------------------- test_json5/dune | 4 + test_json5/test.ml | 92 +++++++++++++++ yojson-json5.opam | 37 ++++++ yojson_json5.opam | 32 ------ 18 files changed, 546 insertions(+), 303 deletions(-) create mode 100644 lib/json5/ast.ml create mode 100644 lib/json5/basic.ml create mode 100644 lib/json5/let_syntax.ml create mode 100644 lib/json5/read.ml create mode 100644 lib/json5/safe.ml delete mode 100644 lib/json5/types.ml create mode 100644 lib/json5/yojson_json5.ml create mode 100644 lib/json5/yojson_json5.mli delete mode 100644 test/json5/dune delete mode 100644 test/json5/json5_test.ml create mode 100644 test_json5/dune create mode 100644 test_json5/test.ml create mode 100644 yojson-json5.opam delete mode 100644 yojson_json5.opam diff --git a/dune-project b/dune-project index 242aac15..f766e1eb 100644 --- a/dune-project +++ b/dune-project @@ -35,3 +35,14 @@ meant for developers that are worried about performance changes in Yojson.") (core (>= v0.14.0)) (core_unix (>= v0.14.0)) (sexplib (>= v0.9.0)))) + +(package + (name yojson-json5) + (synopsis "Yojson_json5 is a parsing and printing library for the JSON5 format") + (description "Yojson_json5 is a parsing and printing library for the JSON5 format. +It supports parsing JSON5 to Yojson.Basic.t and Yojson.Safe.t types.") + (depends + (ocaml (>= 4.08)) + (sedlex (>= 2.5)) + (alcotest (and :with-test (>= 0.8.5))))) + diff --git a/lib/json5/ast.ml b/lib/json5/ast.ml new file mode 100644 index 00000000..8864a8fc --- /dev/null +++ b/lib/json5/ast.ml @@ -0,0 +1,37 @@ +type internal = + | Assoc of (string * internal) list + | List of internal list + | StringLit of string + | IntLit of string + | FloatLit of string + | Bool of bool + | Null + +let strip_quotes s = String.(sub s 1 (length s - 2)) + +let safe_strip_quotes s = + if String.(get s 0 = '"' && get s (length s - 1) = '"') then strip_quotes s + else s + +let rec to_basic = function + | Assoc l -> + `Assoc + (List.map (fun (name, obj) -> (safe_strip_quotes name, to_basic obj)) l) + | List l -> `List (List.map to_basic l) + | StringLit s -> `String (strip_quotes s) + | FloatLit s -> `Float (float_of_string s) + | IntLit s -> `Int (int_of_string s) + | Bool b -> `Bool b + | Null -> `Null + +let rec to_safe = function + | Assoc l -> + `Assoc + (List.map (fun (name, obj) -> (safe_strip_quotes name, to_safe obj)) l) + | List l -> `List (List.map to_safe l) + | StringLit s -> `String (strip_quotes s) + | FloatLit s -> `Float (float_of_string s) + | IntLit s -> ( + match int_of_string_opt s with Some i -> `Int i | None -> `Intlit s) + | Bool b -> `Bool b + | Null -> `Null diff --git a/lib/json5/basic.ml b/lib/json5/basic.ml new file mode 100644 index 00000000..02998593 --- /dev/null +++ b/lib/json5/basic.ml @@ -0,0 +1,7 @@ +include Yojson.Basic + +include Read.Make (struct + type t = Yojson.Basic.t + + let convert = Ast.to_basic +end) diff --git a/lib/json5/dune b/lib/json5/dune index 63b082c9..f0e63ab8 100644 --- a/lib/json5/dune +++ b/lib/json5/dune @@ -1,8 +1,6 @@ (library (name yojson_json5) - (public_name yojson_json5) + (public_name yojson-json5) (libraries yojson sedlex) (preprocess - (pps ppx_deriving.show sedlex.ppx ppx_deriving.eq) - ) -) + (pps sedlex.ppx))) diff --git a/lib/json5/let_syntax.ml b/lib/json5/let_syntax.ml new file mode 100644 index 00000000..83931e28 --- /dev/null +++ b/lib/json5/let_syntax.ml @@ -0,0 +1,3 @@ +module Result = struct + let ( let* ) = Result.bind +end diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index 859fb1d4..ebadfd7b 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -1,100 +1,200 @@ -open Types +type token = + | OPEN_PAREN + | CLOSE_PAREN + | OPEN_BRACE + | CLOSE_BRACE + | OPEN_BRACKET + | CLOSE_BRACKET + | COLON + | COMMA + | COMMENT of string + | TRUE + | FALSE + | NULL + | FLOAT of string + | INT_OR_FLOAT of string + | INT of string + | STRING of string + | IDENTIFIER_NAME of string -(* From https://www.ecma-international.org/ecma-262/5.1/#sec-7 *) +let pp_token ppf = function + | OPEN_PAREN -> Format.fprintf ppf "'('" + | CLOSE_PAREN -> Format.fprintf ppf "')'" + | OPEN_BRACE -> Format.fprintf ppf "'{'" + | CLOSE_BRACE -> Format.fprintf ppf "'}'" + | OPEN_BRACKET -> Format.fprintf ppf "'['" + | CLOSE_BRACKET -> Format.fprintf ppf "']'" + | COLON -> Format.fprintf ppf "':'" + | COMMA -> Format.fprintf ppf "','" + | COMMENT s -> Format.fprintf ppf "COMMENT '%s'" s + | TRUE -> Format.fprintf ppf "'true'" + | FALSE -> Format.fprintf ppf "'false'" + | NULL -> Format.fprintf ppf "'null'" + | FLOAT s -> Format.fprintf ppf "FLOAT '%s'" s + | INT_OR_FLOAT s -> Format.fprintf ppf "INT_OR_FLOAT '%s'" s + | INT s -> Format.fprintf ppf "INT '%s'" s + | STRING s -> Format.fprintf ppf "STRING '%s'" s + | IDENTIFIER_NAME s -> Format.fprintf ppf "IDENTIFIER_NAME '%s'" s -(* -let digit = [%sedlex.regexp? '0'..'9'] -let number = [%sedlex.regexp? Plus digit] -*) let source_character = [%sedlex.regexp? any] let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029] -let line_terminator_sequence = [%sedlex.regexp? 0x000A | 0x000D, Opt 0x000A | 0x2028 | 0x2029] + +let line_terminator_sequence = + [%sedlex.regexp? 0x000A | 0x000D, Opt 0x000A | 0x2028 | 0x2029] (* NUMBERS, 7.8.3 *) -let non_zero_digit = [%sedlex.regexp? '1'..'9'] -let decimal_digit = [%sedlex.regexp? '0'..'9'] +let non_zero_digit = [%sedlex.regexp? '1' .. '9'] +let decimal_digit = [%sedlex.regexp? '0' .. '9'] let decimal_digits = [%sedlex.regexp? Plus decimal_digit] -let hex_digit = [%sedlex.regexp? '0'..'9'|'a'..'f'|'A'..'F'] -let exponent_indicator = [%sedlex.regexp? 'e'|'E'] -let signed_integer = [%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits] +let hex_digit = [%sedlex.regexp? '0' .. '9' | 'a' .. 'f' | 'A' .. 'F'] +let exponent_indicator = [%sedlex.regexp? 'e' | 'E'] + +let signed_integer = + [%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits] + let exponent_part = [%sedlex.regexp? exponent_indicator, signed_integer] -let decimal_integer_literal = [%sedlex.regexp? '0' | non_zero_digit, Opt decimal_digits] -let hex_integer_literal = [%sedlex.regexp? "0x", Plus hex_digit | "0X", Plus hex_digit] + +let decimal_integer_literal = + [%sedlex.regexp? '0' | non_zero_digit, Opt decimal_digits] + +let hex_integer_literal = + [%sedlex.regexp? "0x", Plus hex_digit | "0X", Plus hex_digit] + (* float *) -let float_literal = [%sedlex.regexp? decimal_integer_literal, '.', Opt decimal_digits, Opt exponent_part | '.', decimal_digits, Opt exponent_part] -let json5_float = [%sedlex.regexp? float_literal | '+', float_literal | '-', float_literal] +let float_literal = + [%sedlex.regexp? + ( decimal_integer_literal, '.', Opt decimal_digits, Opt exponent_part + | '.', decimal_digits, Opt exponent_part )] + +let json5_float = + [%sedlex.regexp? float_literal | '+', float_literal | '-', float_literal] + (* int_or_float *) -let int_or_float_literal = [%sedlex.regexp? decimal_integer_literal, Opt exponent_part] -let json5_int_or_float = [%sedlex.regexp? int_or_float_literal | '+', int_or_float_literal | '-', int_or_float_literal] +let int_or_float_literal = + [%sedlex.regexp? decimal_integer_literal, Opt exponent_part] + +let json5_int_or_float = + [%sedlex.regexp? + int_or_float_literal | '+', int_or_float_literal | '-', int_or_float_literal] + (* int/hex *) -let json5_int = [%sedlex.regexp? hex_integer_literal | '+', hex_integer_literal | '-', hex_integer_literal] +let int_literal = + [%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits] + +let json5_int = + [%sedlex.regexp? + ( hex_integer_literal + | '+', hex_integer_literal + | '-', hex_integer_literal + | int_literal )] (* STRING LITERALS, 7.8.4 *) -let unicode_escape_sequence = [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit] +let unicode_escape_sequence = + [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit] + let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}] -let escape_character = [%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u' ] -let non_escape_character = [%sedlex.regexp? Sub (source_character, ( escape_character | line_terminator ) ) ] -let character_escape_sequence = [%sedlex.regexp? single_escape_character | non_escape_character ] -let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence ] -let escape_sequence = [%sedlex.regexp? character_escape_sequence | '0' | unicode_escape_sequence ] (* TODO *) -let single_string_character = [%sedlex.regexp? Sub (source_character, ('\'' | '\\' | line_terminator)) | '\\', escape_sequence | line_continuation ] -let double_string_character = [%sedlex.regexp? Sub (source_character, ('"' | '\\' | line_terminator)) | '\\', escape_sequence | line_continuation ] -let string_literal = [%sedlex.regexp? '"', Star double_string_character, '"' | '\'', Star single_string_character, '\'' ] +let escape_character = + [%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u'] + +let non_escape_character = + [%sedlex.regexp? Sub (source_character, (escape_character | line_terminator))] + +let character_escape_sequence = + [%sedlex.regexp? single_escape_character | non_escape_character] +let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence] + +let escape_sequence = + [%sedlex.regexp? character_escape_sequence | '0' | unicode_escape_sequence] + +let single_string_character = + [%sedlex.regexp? + ( Sub (source_character, ('\'' | '\\' | line_terminator)) + | '\\', escape_sequence + | line_continuation )] + +let double_string_character = + [%sedlex.regexp? + ( Sub (source_character, ('"' | '\\' | line_terminator)) + | '\\', escape_sequence + | line_continuation )] + +let string_literal = + [%sedlex.regexp? + ( '"', Star double_string_character, '"' + | '\'', Star single_string_character, '\'' )] (* IDENTIFIER_NAME (keys in objects) *) -let unicode_combining_mark =[%sedlex.regexp? mn | mc] +let unicode_combining_mark = [%sedlex.regexp? mn | mc] let unicode_digit = [%sedlex.regexp? nd] let unicode_connector_punctuation = [%sedlex.regexp? pc] let unicode_letter = [%sedlex.regexp? lu | ll | lt | lm | lo | nl] let zwnj = [%sedlex.regexp? 0x200C] let zwj = [%sedlex.regexp? 0x200D] -let identifier_start = [%sedlex.regexp? unicode_letter | '$' | '_' | '\\', unicode_escape_sequence] -let identifier_part = [%sedlex.regexp? identifier_start | unicode_combining_mark | unicode_digit | unicode_connector_punctuation | zwnj | zwj] + +let identifier_start = + [%sedlex.regexp? unicode_letter | '$' | '_' | '\\', unicode_escape_sequence] + +let identifier_part = + [%sedlex.regexp? + ( identifier_start | unicode_combining_mark | unicode_digit + | unicode_connector_punctuation | zwnj | zwj )] + let identifier_name = [%sedlex.regexp? identifier_start, Star identifier_part] (* COMMENTS 7.4 *) -let single_line_comment_char = [%sedlex.regexp? Sub (source_character, line_terminator)] +let single_line_comment_char = + [%sedlex.regexp? Sub (source_character, line_terminator)] + let single_line_comment = [%sedlex.regexp? "//", Star single_line_comment_char] let multi_line_not_asterisk_char = [%sedlex.regexp? Sub (source_character, '*')] let multi_line_not_slash_char = [%sedlex.regexp? Sub (source_character, '/')] -let multi_line_comment_char = [%sedlex.regexp? multi_line_not_asterisk_char | '*', Plus multi_line_not_slash_char] -let multi_line_comment = [%sedlex.regexp? "/*", Star multi_line_comment_char, Opt '*', "*/"] + +let multi_line_comment_char = + [%sedlex.regexp? + multi_line_not_asterisk_char | '*', Plus multi_line_not_slash_char] + +let multi_line_comment = + [%sedlex.regexp? "/*", Star multi_line_comment_char, Opt '*', "*/"] + let comment = [%sedlex.regexp? multi_line_comment | single_line_comment] -let white_space = [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs] +let white_space = + [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs] -let rec lex tokens buf = +let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result = + fun tokens buf -> let lexeme = Sedlexing.Utf8.lexeme in match%sedlex buf with - | '{' -> lex (OPEN_BRACE::tokens) buf - | '}' -> lex (CLOSE_BRACE::tokens) buf - | '[' -> lex (OPEN_BRACKET::tokens) buf - | ']' -> lex (CLOSE_BRACKET::tokens) buf - | ':' -> lex (COLON::tokens) buf - | ',' -> lex (COMMA::tokens) buf - | comment - | white_space - | line_terminator -> lex tokens buf - | "true" -> lex (TRUE::tokens) buf - | "false" -> lex (FALSE::tokens) buf - | "null" -> lex (NULL::tokens) buf - | string_literal -> let s = lexeme buf in - lex (STRING s::tokens) buf + | '(' -> lex (OPEN_PAREN :: tokens) buf + | ')' -> lex (CLOSE_PAREN :: tokens) buf + | '{' -> lex (OPEN_BRACE :: tokens) buf + | '}' -> lex (CLOSE_BRACE :: tokens) buf + | '[' -> lex (OPEN_BRACKET :: tokens) buf + | ']' -> lex (CLOSE_BRACKET :: tokens) buf + | ':' -> lex (COLON :: tokens) buf + | ',' -> lex (COMMA :: tokens) buf + | multi_line_comment | single_line_comment | white_space | line_terminator -> + lex tokens buf + | "true" -> lex (TRUE :: tokens) buf + | "false" -> lex (FALSE :: tokens) buf + | "null" -> lex (NULL :: tokens) buf | json5_float -> - let s = float_of_string @@ lexeme buf in - lex (FLOAT s::tokens) buf - | json5_int_or_float -> - let s = lexeme buf in - lex (INT_OR_FLOAT s::tokens) buf + let s = lexeme buf in + lex (FLOAT s :: tokens) buf | json5_int -> - let s = int_of_string @@ lexeme buf in - lex (INT s::tokens) buf + let s = lexeme buf in + lex (INT s :: tokens) buf + | json5_int_or_float -> + let s = lexeme buf in + lex (INT_OR_FLOAT s :: tokens) buf | identifier_name -> - let s = lexeme buf in - lex (IDENTIFIER_NAME s::tokens) buf - | eof -> List.rev tokens + let s = lexeme buf in + lex (IDENTIFIER_NAME s :: tokens) buf + | string_literal -> + let s = lexeme buf in + lex (STRING s :: tokens) buf + | eof -> Ok (List.rev tokens) | _ -> - let s = lexeme buf in - failwith @@ "Unexpected character: '" ^ s ^ "'" + lexeme buf |> Format.asprintf "Unexpected character: '%s'" |> Result.error diff --git a/lib/json5/parser.ml b/lib/json5/parser.ml index f3293bac..815c02e0 100644 --- a/lib/json5/parser.ml +++ b/lib/json5/parser.ml @@ -1,62 +1,84 @@ -open Types - -let escape_string x = x +open Let_syntax.Result let rec parse_list acc = function - | [] -> failwith "List never ends" - | CLOSE_BRACKET::xs - | COMMA::CLOSE_BRACKET::xs -> (acc, xs) - | xs -> ( - let (v, xs) = parse xs in - match xs with - | [] -> failwith "List was not closed" - | CLOSE_BRACKET::xs - | COMMA::CLOSE_BRACKET::xs -> (v::acc, xs) - | COMMA::xs -> parse_list (v::acc) xs - | x::_ -> - let s = Format.asprintf "Unexpected list token: %a" pp_token x in - failwith s) - + | [] -> Error "List never ends" + | Lexer.CLOSE_BRACKET :: xs | COMMA :: CLOSE_BRACKET :: xs -> Ok (acc, xs) + | xs -> ( + let* v, xs = parse xs in + match xs with + | [] -> Error "List was not closed" + | Lexer.CLOSE_BRACKET :: xs | COMMA :: CLOSE_BRACKET :: xs -> + Ok (v :: acc, xs) + | COMMA :: xs -> parse_list (v :: acc) xs + | x :: _ -> + let s = + Format.asprintf "Unexpected list token: %a" Lexer.pp_token x + in + Error s) + and parse_assoc acc = function - | [] -> failwith "Assoc never ends" - | CLOSE_BRACE::xs - | COMMA::CLOSE_BRACE::xs -> (acc, xs) - | (STRING k)::COLON::xs - | (IDENTIFIER_NAME k)::COLON::xs -> ( - let (v, xs) = parse xs in - let item = (k, v) in - match xs with - | [] -> failwith "Object was not closed" - | CLOSE_BRACE::xs - | COMMA::CLOSE_BRACE::xs -> (item::acc, xs) - | COMMA::xs -> parse_assoc (item::acc) xs - | x::_ -> - let s = Format.asprintf "Unexpected assoc list token: %a" pp_token x in - failwith s) - | x::_ -> - let s = Format.asprintf "Unexpected assoc list token: %a" pp_token x in - failwith s + | [] -> Error "Assoc never ends" + | Lexer.CLOSE_BRACE :: xs | COMMA :: CLOSE_BRACE :: xs -> Ok (acc, xs) + | STRING k :: COLON :: xs | IDENTIFIER_NAME k :: COLON :: xs -> ( + let* v, xs = parse xs in + let item = (k, v) in + match xs with + | [] -> Error "Object was not closed" + | Lexer.CLOSE_BRACE :: xs | COMMA :: CLOSE_BRACE :: xs -> + Ok (item :: acc, xs) + | COMMA :: xs -> parse_assoc (item :: acc) xs + | x :: _ -> + let s = + Format.asprintf "Unexpected assoc list token: %a" Lexer.pp_token x + in + Error s) + | x :: _ -> + let s = + Format.asprintf "Unexpected assoc list token: %a" Lexer.pp_token x + in + Error s + +and parse = function + | [] -> Error "empty list of tokens" + | token :: xs -> ( + match token with + | TRUE -> Ok (Ast.Bool true, xs) + | FALSE -> Ok (Bool false, xs) + | NULL -> Ok (Null, xs) + | INT v -> Ok (IntLit v, xs) + | FLOAT v -> Ok (FloatLit v, xs) + | INT_OR_FLOAT v -> Ok (FloatLit v, xs) + | STRING s -> Ok (StringLit s, xs) + | OPEN_BRACKET -> + let* l, xs = parse_list [] xs in + Ok (Ast.List (List.rev l), xs) + | OPEN_BRACE -> + let* a, xs = parse_assoc [] xs in + Ok (Ast.Assoc (List.rev a), xs) + | x -> + let s = Format.asprintf "Unexpected token: %a" Lexer.pp_token x in + Error s) -and parse : token list -> (t * token list) = function - | [] -> failwith "empty list of tokens" - | token::xs -> - match token with - | TRUE -> (`Bool true, xs) - | FALSE -> (`Bool false, xs) - | NULL -> (`Null, xs) - | INT v -> (`Int v, xs) - | FLOAT v -> (`Float v, xs) - | INT_OR_FLOAT v -> (`String v, xs) - | STRING s -> (`String (escape_string s), xs) - | OPEN_BRACKET -> - let (l, xs) = parse_list [] xs in - (`List (List.rev l), xs) - | OPEN_BRACE -> - let (a, xs) = parse_assoc [] xs in - (`Assoc (List.rev a), xs) - | x -> - let s = Format.asprintf "Unexpected token: %a" pp_token x in - failwith s +let parse_from_lexbuf ?fname ?lnum lexbuffer = + let fname = Option.value fname ~default:"" in + Sedlexing.set_filename lexbuffer fname; + let lnum = Option.value lnum ~default:1 in + let pos = + { Lexing.pos_fname = fname; pos_lnum = lnum; pos_bol = 0; pos_cnum = 0 } + in + Sedlexing.set_position lexbuffer pos; + let* tokens = Lexer.lex [] lexbuffer in + let* ast = parse tokens in + Ok (fst ast) +let parse_from_string ?fname ?lnum input = + parse_from_lexbuf (Sedlexing.Utf8.from_string input) ?fname ?lnum +let parse_from_channel ?fname ?lnum ic = + parse_from_lexbuf (Sedlexing.Utf8.from_channel ic) ?fname ?lnum +let parse_from_file ?fname ?lnum filename = + let ic = open_in filename in + let out = parse_from_channel ?fname ?lnum ic in + close_in ic; + out diff --git a/lib/json5/read.ml b/lib/json5/read.ml new file mode 100644 index 00000000..4f4cacbf --- /dev/null +++ b/lib/json5/read.ml @@ -0,0 +1,34 @@ +open Let_syntax.Result + +module type S = sig + type t + + val convert : Ast.internal -> t +end + +module type Out = sig + type t + + val from_string : ?fname:string -> ?lnum:int -> string -> (t, string) result + + val from_channel : + ?fname:string -> ?lnum:int -> in_channel -> (t, string) result + + val from_file : ?fname:string -> ?lnum:int -> string -> (t, string) result +end + +module Make (F : S) : Out with type t = F.t = struct + type t = F.t + + let from_string ?fname ?lnum input = + let* ast = Parser.parse_from_string ?fname ?lnum input in + Ok (F.convert ast) + + let from_channel ?fname ?lnum ic = + let* ast = Parser.parse_from_channel ?fname ?lnum ic in + Ok (F.convert ast) + + let from_file ?fname ?lnum file = + let* ast = Parser.parse_from_file ?fname ?lnum file in + Ok (F.convert ast) +end diff --git a/lib/json5/safe.ml b/lib/json5/safe.ml new file mode 100644 index 00000000..1eace77c --- /dev/null +++ b/lib/json5/safe.ml @@ -0,0 +1,7 @@ +include Yojson.Safe + +include Read.Make (struct + type t = Yojson.Safe.t + + let convert = Ast.to_safe +end) diff --git a/lib/json5/types.ml b/lib/json5/types.ml deleted file mode 100644 index ac98404d..00000000 --- a/lib/json5/types.ml +++ /dev/null @@ -1,18 +0,0 @@ -type token = - | OPEN_BRACE - | CLOSE_BRACE - | OPEN_BRACKET - | CLOSE_BRACKET - | COLON - | COMMA - | TRUE - | FALSE - | NULL - | FLOAT of float - | INT_OR_FLOAT of string - | INT of int - | STRING of string - | IDENTIFIER_NAME of string - [@@deriving show, eq] - -type t = Yojson.Safe.t diff --git a/lib/json5/yojson_json5.ml b/lib/json5/yojson_json5.ml new file mode 100644 index 00000000..58606838 --- /dev/null +++ b/lib/json5/yojson_json5.ml @@ -0,0 +1,2 @@ +module Safe = Safe +module Basic = Basic diff --git a/lib/json5/yojson_json5.mli b/lib/json5/yojson_json5.mli new file mode 100644 index 00000000..8e069cde --- /dev/null +++ b/lib/json5/yojson_json5.mli @@ -0,0 +1,71 @@ +module Safe : sig + type t = Yojson.Safe.t + + val from_string : ?fname:string -> ?lnum:int -> string -> (t, string) result + + val from_channel : + ?fname:string -> ?lnum:int -> in_channel -> (t, string) result + + val from_file : ?fname:string -> ?lnum:int -> string -> (t, string) result + + val to_string : + ?buf:Buffer.t -> ?len:int -> ?suf:string -> ?std:bool -> t -> string + + val to_channel : + ?buf:Stdlib.Buffer.t -> + ?len:int -> + ?suf:string -> + ?std:bool -> + Stdlib.out_channel -> + t -> + unit + + val to_output : + ?buf:Stdlib.Buffer.t -> + ?len:int -> + ?suf:string -> + ?std:bool -> + < output : string -> int -> int -> int > -> + t -> + unit + + val to_file : ?len:int -> ?std:bool -> ?suf:string -> string -> t -> unit + val pp : Format.formatter -> t -> unit + val equal : t -> t -> bool +end + +module Basic : sig + type t = Yojson.Basic.t + + val from_string : ?fname:string -> ?lnum:int -> string -> (t, string) result + + val from_channel : + ?fname:string -> ?lnum:int -> in_channel -> (t, string) result + + val from_file : ?fname:string -> ?lnum:int -> string -> (t, string) result + + val to_string : + ?buf:Buffer.t -> ?len:int -> ?suf:string -> ?std:bool -> t -> string + + val to_channel : + ?buf:Stdlib.Buffer.t -> + ?len:int -> + ?suf:string -> + ?std:bool -> + Stdlib.out_channel -> + t -> + unit + + val to_output : + ?buf:Stdlib.Buffer.t -> + ?len:int -> + ?suf:string -> + ?std:bool -> + < output : string -> int -> int -> int > -> + t -> + unit + + val to_file : ?len:int -> ?std:bool -> ?suf:string -> string -> t -> unit + val pp : Format.formatter -> t -> unit + val equal : t -> t -> bool +end diff --git a/test/json5/dune b/test/json5/dune deleted file mode 100644 index 85583f9a..00000000 --- a/test/json5/dune +++ /dev/null @@ -1,5 +0,0 @@ -(executable - (name json5_test) - (libraries yojson_json5 alcotest) - (preprocess - (pps ppx_deriving.show ppx_deriving.eq))) diff --git a/test/json5/json5_test.ml b/test/json5/json5_test.ml deleted file mode 100644 index be6b6614..00000000 --- a/test/json5/json5_test.ml +++ /dev/null @@ -1,127 +0,0 @@ -module Lexer = Yojson_json5.Lexer -open Yojson_json5.Types - -let tokenize_json5 (json_string) = - let buf = Sedlexing.Utf8.from_string json_string in - Lexer.lex [] buf - -let token = Alcotest.testable pp_token equal_token - -let test_float () = - Alcotest.(check (list token)) "Simple" [FLOAT 23.52] (tokenize_json5 "23.52"); - Alcotest.(check (list token)) "No leading number" [FLOAT 0.52] (tokenize_json5 ".52"); - Alcotest.(check (list token)) "With exponent" [FLOAT 210.; FLOAT 210.] (tokenize_json5 "2.1e2 2.1E2") - -let test_int_or_float () = - Alcotest.(check (list token)) "Int or float" [INT_OR_FLOAT "42"] (tokenize_json5 "42") - -let test_int () = - Alcotest.(check (list token)) "Hex/Int" [INT 16] (tokenize_json5 "0x10") - -let test_string () = - Alcotest.(check (list token)) "Doublequoted simple" [STRING "\"hello\""] (tokenize_json5 "\"hello\""); - Alcotest.(check (list token)) "Doublequoted single-character escape sequence" [STRING {|"\'\"\\\b\f\n\r\t\v"|}] (tokenize_json5 {|"\'\"\\\b\f\n\r\t\v"|}); - Alcotest.(check (list token)) "Doublequoted non-escape-character escape sequence" [STRING {|"foo\z"|}] (tokenize_json5 {|"foo\z"|}); - Alcotest.(check (list token)) "Doublequoted zero escape sequence" [STRING {|"\0"|}] (tokenize_json5 {|"\0"|}); - (* Alcotest.check_raises "Doublequoted zero then one escape sequence" (Failure "Unexpected character: ''") (fun () -> ignore @@ tokenize_json5 {|"\01"|}); *) - Alcotest.(check (list token)) "Doublequoted unicode escape" [STRING "\"\\uD83D\\uDC2A\""] (tokenize_json5 "\"\\uD83D\\uDC2A\""); - Alcotest.(check (list token)) "Doublequoted line continuation" [STRING "\"hel\\\nlo\""] (tokenize_json5 "\"hel\\\nlo\""); - Alcotest.(check (list token)) "Singlequoted simple" [STRING "'hello'"] (tokenize_json5 "'hello'"); - Alcotest.(check (list token)) "Singlequoted single-character escape sequence" [STRING {|'\'\"\\\b\f\n\r\t\v'|}] (tokenize_json5 {|'\'\"\\\b\f\n\r\t\v'|}); - Alcotest.(check (list token)) "Singlequoted non-escape-character escape sequence" [STRING {|'\z'|}] (tokenize_json5 {|'\z'|}); - Alcotest.(check (list token)) "Singlequoted zero escape sequence" [STRING {|'\0'|}] (tokenize_json5 {|'\0'|}); - Alcotest.(check (list token)) "Singlequoted unicode escape" [STRING "'\\uD83D\\uDC2A'"] (tokenize_json5 "'\\uD83D\\uDC2A'"); - Alcotest.(check (list token)) "Singlequoted line continuation" [STRING "'hel\\\nlo'"] (tokenize_json5 "'hel\\\nlo'"); - (* Alcotest.(check (list token)) "Singlequoted one escape sequence" [STRING {|'\1'|}] (tokenize_json5 {|'\1'|}); *) - () - -let test_identifier () = - Alcotest.(check (list token)) - "Identifer name in an object" - [OPEN_BRACE; IDENTIFIER_NAME "hj"; COLON; INT_OR_FLOAT "42"; CLOSE_BRACE] - (tokenize_json5 "{hj: 42}") - -let test_multi_line_comments () = - Alcotest.(check (list token)) "Simple case" [] (tokenize_json5 "/* hello\nworld */"); - Alcotest.(check (list token)) "Between numbers" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/* hello\nworld */1"); - Alcotest.(check (list token)) "Empty" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/**/1"); - Alcotest.(check (list token)) "Contains slash" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/*/*/1"); - Alcotest.(check (list token)) "Contains asterisk" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/***/1"); - Alcotest.(check (list token)) "Contains double asterisk" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1/****/1"); - Alcotest.check_raises "Contains comment end" (Failure "Unexpected character: ''") (fun () -> ignore @@ tokenize_json5 "/* */ */") - -let test_single_line_comments () = - Alcotest.(check (list token)) "Simple case" [] (tokenize_json5 "//foo\n"); - Alcotest.(check (list token)) "Between numbers" [INT_OR_FLOAT "1"; INT_OR_FLOAT "1"] (tokenize_json5 "1//foo\n1") - - -(** - * PARSING - *) - -let yojson = Alcotest.testable Yojson.Safe.pp Yojson.Safe.equal - -let parse tokens = - let (json, _) = Yojson_json5.Parser.parse tokens in - json - -let test_parser_simple () = - Alcotest.(check yojson) "Simple null" `Null (parse [NULL]); - Alcotest.(check yojson) "Simple true" (`Bool true) (parse [TRUE]); - Alcotest.(check yojson) "Simple false" (`Bool false) (parse [FALSE]); - Alcotest.(check yojson) "Simple int" (`Int 3) (parse [INT 3]); - Alcotest.(check yojson) "Simple float" (`Float 3.4) (parse [FLOAT 3.4]); - () - -let test_parser_string () = - Alcotest.(check yojson) "Simple string" (`String "hello") (parse [STRING "hello"]); - Alcotest.(check yojson) "Escape sequences" (`String "a\'\"\\\b\n\r\ta") (parse [STRING "a\'\"\\\b\n\r\ta"]); - () - -let test_parser_list () = - Alcotest.(check yojson) "Empty list" (`List []) (parse [OPEN_BRACKET; CLOSE_BRACKET]); - Alcotest.(check yojson) "List with bools" (`List [`Bool false; `Bool true]) (parse [OPEN_BRACKET; FALSE; COMMA; TRUE; CLOSE_BRACKET]); - Alcotest.(check yojson) "List of lists" (`List [ `List []; `Null ]) (parse [OPEN_BRACKET; OPEN_BRACKET; CLOSE_BRACKET; COMMA; NULL; CLOSE_BRACKET]); - Alcotest.(check yojson) "List with trailing comma" (`List [ `Null ]) (parse [OPEN_BRACKET; NULL; COMMA; CLOSE_BRACKET]); - Alcotest.(check yojson) "List of list with content" (`List [ `List [ `Bool true ] ]) (parse [OPEN_BRACKET; OPEN_BRACKET; TRUE; CLOSE_BRACKET; CLOSE_BRACKET]); - () - -let test_parser_object () = - Alcotest.(check yojson) "Empty object" (`Assoc []) (parse [OPEN_BRACE; CLOSE_BRACE]); - Alcotest.(check yojson) "String key" (`Assoc [ ("foo", `Null) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; NULL; CLOSE_BRACE]); - Alcotest.(check yojson) "Identifer key" (`Assoc [ ("foo", `Null) ]) (parse [OPEN_BRACE; IDENTIFIER_NAME "foo"; COLON; NULL; CLOSE_BRACE]); - Alcotest.(check yojson) "Trailing comma" (`Assoc [ ("foo", `Null) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; NULL; COMMA; CLOSE_BRACE]); - Alcotest.(check yojson) "Nested object" (`Assoc [ ("foo", `Assoc [ ("bar", `Null) ]) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; OPEN_BRACE; STRING "bar"; COLON; NULL; CLOSE_BRACE; COMMA; CLOSE_BRACE]); - Alcotest.(check yojson) "Mixed keys and values" (`Assoc [ ("foo", `Bool true); ("bar", `Null); ("baz", `Bool false) ]) (parse [OPEN_BRACE; STRING "foo"; COLON; TRUE; COMMA; IDENTIFIER_NAME "bar"; COLON; NULL; COMMA; IDENTIFIER_NAME "baz"; COLON; FALSE; CLOSE_BRACE]); - () - - -(** - * RUN - *) - -let () = - let open Alcotest in - run "JSON5" [ - "Numbers", [ - test_case "Float" `Quick test_float; - test_case "Int or float" `Quick test_int_or_float; - test_case "Int" `Quick test_int; - ]; - "Strings", [ - test_case "String" `Quick test_string; - ]; - "Objects", [ - test_case "Identifiers" `Quick test_identifier; - ]; - "Comments", [ - test_case "Multi-line comments" `Quick test_multi_line_comments; - test_case "Single-line comments" `Quick test_single_line_comments; - ]; - "Parse", [ - test_case "Simple parsing" `Quick test_parser_simple; - test_case "Strings" `Quick test_parser_string; - test_case "Simple list parsing" `Quick test_parser_list; - test_case "Simple object parsing" `Quick test_parser_object; - ]; - ] diff --git a/test_json5/dune b/test_json5/dune new file mode 100644 index 00000000..46b63c12 --- /dev/null +++ b/test_json5/dune @@ -0,0 +1,4 @@ +(test + (name test) + (package yojson-json5) + (libraries alcotest yojson_json5)) diff --git a/test_json5/test.ml b/test_json5/test.ml new file mode 100644 index 00000000..3488a54c --- /dev/null +++ b/test_json5/test.ml @@ -0,0 +1,92 @@ +module M = struct + include Yojson_json5.Safe + + let from_string s = + match from_string s with + | Ok t -> t + | Error e -> raise (Yojson.Json_error e) +end + +let yojson_json5 = Alcotest.testable M.pp M.equal + +let test_from_string () = + Alcotest.(check yojson_json5) "Empty object" (`Assoc []) (M.from_string "{}"); + Alcotest.(check yojson_json5) "Empty list" (`List []) (M.from_string "[]"); + Alcotest.(check yojson_json5) + "List" + (`List [ `Int 1; `String "2"; `Float 3. ]) + (M.from_string "[1, \"2\", 3.0]"); + Alcotest.(check yojson_json5) "true" (`Bool true) (M.from_string "true"); + Alcotest.(check yojson_json5) "false" (`Bool false) (M.from_string "false"); + Alcotest.(check yojson_json5) "null" `Null (M.from_string "null"); + Alcotest.(check yojson_json5) + "double quotes string" (`String "hello world") + (M.from_string {|"hello world"|}); + Alcotest.(check yojson_json5) + "single quotes string" (`String "hello world") + (M.from_string {|'hello world'|}); + Alcotest.(check yojson_json5) + "float" (`Float 12345.67890) + (M.from_string "12345.67890"); + Alcotest.(check yojson_json5) "hex" (`Int 0x1) (M.from_string "0x1"); + Alcotest.(check yojson_json5) "int" (`Int 1) (M.from_string "1"); + Alcotest.(check yojson_json5) + "line break" (`String "foo\\\nbar") + (M.from_string "\"foo\\\nbar\""); + Alcotest.(check yojson_json5) + "string and comment" (`String "bar") + (M.from_string "\"bar\" //foo"); + let expected = + `Assoc + [ + ("unquoted", `String "and you can quote me on that"); + ("singleQuotes", `String "I can use \"double quotes\" here"); + ("lineBreaks", `String {|Look, Mom! \ +No \\n's!|}); + ("hexadecimal", `Int 0xdecaf); + ("leadingDecimalPoint", `Float 0.8675309); + ("andTrailing", `Float 8675309.0); + ("positiveSign", `Int 1); + ("trailingComma", `String "in objects"); + ("andIn", `List [ `String "arrays" ]); + ("backwardsCompatible", `String "with JSON"); + ] + in + Alcotest.(check yojson_json5) + "More elaborated" expected + (M.from_string + {|{ + // comments + unquoted: 'and you can quote me on that', + singleQuotes: 'I can use "double quotes" here', + lineBreaks: "Look, Mom! \ +No \\n's!", + hexadecimal: 0xdecaf, + leadingDecimalPoint: .8675309, andTrailing: 8675309., + positiveSign: +1, + trailingComma: 'in objects', andIn: ['arrays',], + "backwardsCompatible": "with JSON", +}|}) + +let test_to_string () = + Alcotest.(check string) "Empty object" "{}" (M.to_string (`Assoc [])); + Alcotest.(check string) "Empty list" "[]" (M.to_string (`List [])); + Alcotest.(check string) "true" "true" (M.to_string (`Bool true)); + Alcotest.(check string) "false" "false" (M.to_string (`Bool false)); + Alcotest.(check string) "null" "null" (M.to_string `Null); + Alcotest.(check string) + "string" "\"hello world\"" + (M.to_string (`String "hello world")); + Alcotest.(check string) "float" "12345.6789" (M.to_string (`Float 12345.6789)); + Alcotest.(check string) "hex" "1" (M.to_string (`Int 0x1)); + Alcotest.(check string) "int" "1" (M.to_string (`Int 1)) + +(* Run it *) +let () = + let open Alcotest in + run "JSON5" + [ + ( "from_string", + [ test_case "reading from string" `Quick test_from_string ] ); + ("to_string", [ test_case "write to string" `Quick test_to_string ]); + ] diff --git a/yojson-json5.opam b/yojson-json5.opam new file mode 100644 index 00000000..c458ef00 --- /dev/null +++ b/yojson-json5.opam @@ -0,0 +1,37 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +synopsis: + "Yojson_json5 is a parsing and printing library for the JSON5 format" +description: """ +Yojson_json5 is a parsing and printing library for the JSON5 format. +It supports parsing JSON5 to Yojson.Basic.t and Yojson.Safe.t types.""" +maintainer: [ + "paul-elliot@tarides.com" "nathan@tarides.com" "marek@tarides.com" +] +authors: ["Martin Jambon"] +license: "BSD-3-Clause" +homepage: "https://github.com/ocaml-community/yojson" +doc: "https://ocaml-community.github.io/yojson" +bug-reports: "https://github.com/ocaml-community/yojson/issues" +depends: [ + "dune" {>= "2.7"} + "ocaml" {>= "4.08"} + "sedlex" {>= "2.5"} + "alcotest" {with-test & >= "0.8.5"} + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] +] +dev-repo: "git+https://github.com/ocaml-community/yojson.git" diff --git a/yojson_json5.opam b/yojson_json5.opam deleted file mode 100644 index 712d4c4d..00000000 --- a/yojson_json5.opam +++ /dev/null @@ -1,32 +0,0 @@ -opam-version: "2.0" -maintainer: ["nathan@cryptosense.com" "marek@xivilization.net"] -authors: ["Martin Jambon"] -homepage: "https://github.com/ocaml-community/yojson" -bug-reports: "https://github.com/ocaml-community/yojson/issues" -dev-repo: "git+https://github.com/ocaml-community/yojson.git" -doc: "https://ocaml-community.github.io/yojson/" -build: [ - ["dune" "subst"] {pinned} - ["dune" "build" "-p" name "-j" jobs] -] -run-test: [["dune" "runtest" "-p" name "-j" jobs]] -depends: [ - "ocaml" {>= "4.02.3"} - "dune" - "sedlex" - "alcotest" {with-test & >= "0.8.5"} - "ppx_deriving" -] -synopsis: - "Yojson is an optimized parsing and printing library for the JSON format" -description: """ -Yojson is an optimized parsing and printing library for the JSON format. - -It addresses a few shortcomings of json-wheel including 2x speedup, -polymorphic variants and optional syntax for tuples and variants. - -ydump is a pretty-printing command-line program provided with the -yojson package. - -The program atdgen can be used to derive OCaml-JSON serializers and -deserializers from type definitions.""" From f55e517b9198350d4c1526c8b2f7fdc13e177b4c Mon Sep 17 00:00:00 2001 From: Daniel Hilst Date: Sun, 25 Sep 2022 09:42:08 -0300 Subject: [PATCH 13/24] Add unicode, hex, octal, \n and friends escaping support --- lib/json5/lexer.ml | 69 ++++++++++++++++++++++++++++++++---- lib/json5/unescape.ml | 81 +++++++++++++++++++++++++++++++++++++++++++ test_json5/test.ml | 14 ++++++++ 3 files changed, 157 insertions(+), 7 deletions(-) create mode 100644 lib/json5/unescape.ml diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index ebadfd7b..611ac09a 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -1,3 +1,5 @@ +open Let_syntax.Result + type token = | OPEN_PAREN | CLOSE_PAREN @@ -92,7 +94,7 @@ let json5_int = let unicode_escape_sequence = [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit] -let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}] +let single_escape_character = [%sedlex.regexp? Chars {|'"\bfnrtv|}] let escape_character = [%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u'] @@ -104,9 +106,13 @@ let character_escape_sequence = [%sedlex.regexp? single_escape_character | non_escape_character] let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence] +let hex_escape_sequence = [%sedlex.regexp? 'x', hex_digit, hex_digit] let escape_sequence = - [%sedlex.regexp? character_escape_sequence | '0' | unicode_escape_sequence] + [%sedlex.regexp? + ( character_escape_sequence + | '0', Opt (decimal_digit, decimal_digit) + | hex_escape_sequence | unicode_escape_sequence )] let single_string_character = [%sedlex.regexp? @@ -163,8 +169,57 @@ let comment = [%sedlex.regexp? multi_line_comment | single_line_comment] let white_space = [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs] -let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result = - fun tokens buf -> +let string_lex_single lexbuf strbuf = + Buffer.add_char strbuf '\''; + let lexeme = Sedlexing.Utf8.lexeme in + let rec lex lexbuf strbuf = + match%sedlex lexbuf with + | '\'' -> + Buffer.add_char strbuf '\''; + Ok (Buffer.contents strbuf) + | '\\', escape_sequence -> + let* s = Unescape.unescape (lexeme lexbuf) in + Buffer.add_string strbuf s; + lex lexbuf strbuf + | Sub (source_character, '\'') -> + Buffer.add_string strbuf (lexeme lexbuf); + lex lexbuf strbuf + | _ -> + lexeme lexbuf + |> Format.asprintf "Unexpected character: %s" + |> Result.error + in + lex lexbuf strbuf + +let string_lex_double lexbuf strbuf = + Buffer.add_char strbuf '"'; + let lexeme = Sedlexing.Utf8.lexeme in + let rec lex lexbuf strbuf = + match%sedlex lexbuf with + | '"' -> + Buffer.add_char strbuf '"'; + Ok (Buffer.contents strbuf) + | '\\', escape_sequence -> + let* s = Unescape.unescape (lexeme lexbuf) in + Buffer.add_string strbuf s; + lex lexbuf strbuf + | Sub (source_character, '"') -> + Buffer.add_string strbuf (lexeme lexbuf); + lex lexbuf strbuf + | _ -> + lexeme lexbuf + |> Format.asprintf "Unexpected character: %s" + |> Result.error + in + lex lexbuf strbuf + +let string_lex lexbuf quote = + let strbuf = Buffer.create 200 in + if quote = "'" then string_lex_single lexbuf strbuf + else if quote = "\"" then string_lex_double lexbuf strbuf + else Error (Format.sprintf "invalid string quote %s" quote) + +let rec lex tokens buf = let lexeme = Sedlexing.Utf8.lexeme in match%sedlex buf with | '(' -> lex (OPEN_PAREN :: tokens) buf @@ -175,6 +230,9 @@ let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result = | ']' -> lex (CLOSE_BRACKET :: tokens) buf | ':' -> lex (COLON :: tokens) buf | ',' -> lex (COMMA :: tokens) buf + | Chars {|"'|} -> + let* s = string_lex buf (lexeme buf) in + lex (STRING s :: tokens) buf | multi_line_comment | single_line_comment | white_space | line_terminator -> lex tokens buf | "true" -> lex (TRUE :: tokens) buf @@ -192,9 +250,6 @@ let rec lex : token list -> Sedlexing.lexbuf -> (token list, string) result = | identifier_name -> let s = lexeme buf in lex (IDENTIFIER_NAME s :: tokens) buf - | string_literal -> - let s = lexeme buf in - lex (STRING s :: tokens) buf | eof -> Ok (List.rev tokens) | _ -> lexeme buf |> Format.asprintf "Unexpected character: '%s'" |> Result.error diff --git a/lib/json5/unescape.ml b/lib/json5/unescape.ml new file mode 100644 index 00000000..63e571b0 --- /dev/null +++ b/lib/json5/unescape.ml @@ -0,0 +1,81 @@ +open Let_syntax.Result + +let ( % ) = Int.logor +let ( << ) = Int.shift_left +let ( >> ) = Int.shift_right +let ( & ) = Int.logand + +let utf_8_string_of_unicode i = + if i <= 0x007F then ( + let b = Bytes.create 1 in + Bytes.set_int8 b 0 i; + Ok (Bytes.to_string b)) + else if i <= 0x07FF then ( + let five_high_bits = i >> 6 & 0b11111 in + let six_low_bits = i & 0b111111 in + let high = 0b11000000 % five_high_bits << 8 in + let low = 0b10000000 % six_low_bits in + let n = high % low in + let b = Bytes.create 2 in + Bytes.set_int16_be b 0 n; + Ok (Bytes.to_string b)) + else if i <= 0xFFFF then ( + let four_high_bits = i >> 12 & 0b1111 in + let six_mid_bits = i >> 6 & 0b111111 in + let six_low_bits = i & 0b111111 in + let high = 0b11100000 % four_high_bits << 16 in + let mid = 0b10000000 % six_mid_bits << 8 in + let low = 0b10000000 % six_low_bits in + let n = high % mid % low in + let b = Bytes.create 3 in + Bytes.set_int32_be b 0 (Int32.of_int n); + Ok (Bytes.to_string b)) + else if i <= 0x10FFFF then ( + let three_hh_bits = i >> 18 & 0b111 in + let six_hl_bits = i >> 12 & 0b111111 in + let six_lh_bits = i >> 6 & 0b111111 in + let six_ll_bits = i & 0b111111 in + let hh = 0b11110000 % three_hh_bits << 24 in + let hl = 0b10000000 % six_hl_bits << 16 in + let lh = 0b10000000 % six_lh_bits << 8 in + let ll = 0b10000000 % six_ll_bits in + let n = hh % hl % lh % ll in + let b = Bytes.create 4 in + Bytes.set_int32_be b 0 (Int32.of_int n); + Ok (Bytes.to_string b)) + else Error (Format.sprintf "invalid code point %X" i) + +let unescape str = + if String.length str < 2 then + Error (Format.sprintf "too small escape sequence %s" str) + else + match str.[1] with + | 'u' -> + let escape_chars = String.sub str 2 4 in + let* as_int = + Format.sprintf "0x%s" escape_chars |> int_of_string_opt |> function + | Some x -> Ok x + | None -> Error (Format.sprintf "bad escape sequence %s" escape_chars) + in + utf_8_string_of_unicode as_int + | 'x' -> + let escape_chars = String.sub str 2 2 in + let* as_int = + Format.sprintf "0x%s" escape_chars |> int_of_string_opt |> function + | Some x -> Ok x + | None -> Error (Format.sprintf "bad escape sequence %s" escape_chars) + in + utf_8_string_of_unicode as_int + | '\\' | '"' | 'n' | 't' -> Ok str + | '0' -> + if String.length str = 2 then Ok "\x00" + else if String.length str = 4 then + let octal_str = String.(sub str 2 2) in + let* as_int = + Format.sprintf "0o%s" octal_str |> int_of_string_opt |> function + | Some x -> Ok x + | None -> Error (Format.sprintf "bad escape sequence %s" octal_str) + in + utf_8_string_of_unicode as_int + else Error (Format.sprintf "invalid octal sequence %s" str) + | _ -> Error (Format.sprintf "invalid escape sequence %c" str.[1]) diff --git a/test_json5/test.ml b/test_json5/test.ml index 3488a54c..1b853e57 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -29,6 +29,20 @@ let test_from_string () = "float" (`Float 12345.67890) (M.from_string "12345.67890"); Alcotest.(check yojson_json5) "hex" (`Int 0x1) (M.from_string "0x1"); + Alcotest.(check yojson_json5) + "hex escape sequence" (`String "a") (M.from_string {|"\x61"|}); + Alcotest.(check yojson_json5) + "unicode escape sequence" (`String "λ") + (M.from_string {|"\u03bb"|}); + Alcotest.(check yojson_json5) + "more string escaping" (`String "Hello λ world") + (M.from_string "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\""); + Alcotest.(check yojson_json5) + "null byte string" (`String "\x00") (M.from_string {|"\0"|}); + Alcotest.(check yojson_json5) + "octal string" (`String "?") (M.from_string {|"\077"|}); + Alcotest.(check yojson_json5) + "null and octal string" (`String "\x007") (M.from_string {|"\07"|}); Alcotest.(check yojson_json5) "int" (`Int 1) (M.from_string "1"); Alcotest.(check yojson_json5) "line break" (`String "foo\\\nbar") From 66dc60522f7146e012f58e288da0b5dc9c49f634 Mon Sep 17 00:00:00 2001 From: Daniel Hilst Date: Sun, 11 Sep 2022 10:44:34 -0300 Subject: [PATCH 14/24] Add more tests for JSON5 --- lib/json5/lexer.ml | 8 +++++--- lib/json5/unescape.ml | 3 ++- test_json5/test.ml | 19 ++++++++++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index 611ac09a..2ec76ba9 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -94,7 +94,7 @@ let json5_int = let unicode_escape_sequence = [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit] -let single_escape_character = [%sedlex.regexp? Chars {|'"\bfnrtv|}] +let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}] let escape_character = [%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u'] @@ -181,7 +181,8 @@ let string_lex_single lexbuf strbuf = let* s = Unescape.unescape (lexeme lexbuf) in Buffer.add_string strbuf s; lex lexbuf strbuf - | Sub (source_character, '\'') -> + | line_continuation -> lex lexbuf strbuf + | Sub (source_character, ('\'' | line_terminator)) -> Buffer.add_string strbuf (lexeme lexbuf); lex lexbuf strbuf | _ -> @@ -203,7 +204,8 @@ let string_lex_double lexbuf strbuf = let* s = Unescape.unescape (lexeme lexbuf) in Buffer.add_string strbuf s; lex lexbuf strbuf - | Sub (source_character, '"') -> + | line_continuation -> lex lexbuf strbuf + | Sub (source_character, ('"' | line_terminator)) -> Buffer.add_string strbuf (lexeme lexbuf); lex lexbuf strbuf | _ -> diff --git a/lib/json5/unescape.ml b/lib/json5/unescape.ml index 63e571b0..8325dbb1 100644 --- a/lib/json5/unescape.ml +++ b/lib/json5/unescape.ml @@ -66,7 +66,8 @@ let unescape str = | None -> Error (Format.sprintf "bad escape sequence %s" escape_chars) in utf_8_string_of_unicode as_int - | '\\' | '"' | 'n' | 't' -> Ok str + | '"' | '\'' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' -> Ok str + | '\\' -> Ok {|\|} | '0' -> if String.length str = 2 then Ok "\x00" else if String.length str = 4 then diff --git a/test_json5/test.ml b/test_json5/test.ml index 1b853e57..8999094e 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -1,6 +1,13 @@ module M = struct include Yojson_json5.Safe + let from_string_err s = + match from_string s with + | Ok x -> + failwith + (Format.sprintf "Test didn't failed when should: %s" (to_string x)) + | Error e -> e + let from_string s = match from_string s with | Ok t -> t @@ -37,6 +44,10 @@ let test_from_string () = Alcotest.(check yojson_json5) "more string escaping" (`String "Hello λ world") (M.from_string "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\""); + Alcotest.(check string) + "string unescaped linebreak fails" "Unexpected character: " + (M.from_string_err {|"foo +bar"|}); Alcotest.(check yojson_json5) "null byte string" (`String "\x00") (M.from_string {|"\0"|}); Alcotest.(check yojson_json5) @@ -45,7 +56,10 @@ let test_from_string () = "null and octal string" (`String "\x007") (M.from_string {|"\07"|}); Alcotest.(check yojson_json5) "int" (`Int 1) (M.from_string "1"); Alcotest.(check yojson_json5) - "line break" (`String "foo\\\nbar") + "backslash escape" (`String {|foo\bar|}) + (M.from_string {|"foo\\bar"|}); + Alcotest.(check yojson_json5) + "line break" (`String "foobar") (M.from_string "\"foo\\\nbar\""); Alcotest.(check yojson_json5) "string and comment" (`String "bar") @@ -55,8 +69,7 @@ let test_from_string () = [ ("unquoted", `String "and you can quote me on that"); ("singleQuotes", `String "I can use \"double quotes\" here"); - ("lineBreaks", `String {|Look, Mom! \ -No \\n's!|}); + ("lineBreaks", `String {|Look, Mom! No \n's!|}); ("hexadecimal", `Int 0xdecaf); ("leadingDecimalPoint", `Float 0.8675309); ("andTrailing", `Float 8675309.0); From 59659e281495809b8442457868725b4beee0772f Mon Sep 17 00:00:00 2001 From: Daniel Hilst Date: Sun, 11 Sep 2022 10:44:34 -0300 Subject: [PATCH 15/24] Add more tests for JSON5 --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 9f467c02..826f98a9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ ## unreleased ### Added +- Added support for JSON5 ### Changed From 01e30dd44b0f5d033d24df0cb3825214263c0531 Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Fri, 26 Apr 2024 11:30:33 +0200 Subject: [PATCH 16/24] Update Changelog --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 826f98a9..658262c7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,7 +1,8 @@ ## unreleased ### Added -- Added support for JSON5 + +- Added support for JSON5 (@dhilst, @gorm-issuu, @gertsonderby, #152) ### Changed From 3d408701a238add7b83b7865f8892463733a9bc9 Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Fri, 26 Apr 2024 12:20:07 +0200 Subject: [PATCH 17/24] Extend tests and split them into their own test cases --- lib/json5/lexer.ml | 8 +-- test_json5/test.ml | 170 ++++++++++++++++++++++----------------------- 2 files changed, 86 insertions(+), 92 deletions(-) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index 2ec76ba9..66bf952b 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -187,7 +187,7 @@ let string_lex_single lexbuf strbuf = lex lexbuf strbuf | _ -> lexeme lexbuf - |> Format.asprintf "Unexpected character: %s" + |> Format.sprintf "Unexpected character: %s" |> Result.error in lex lexbuf strbuf @@ -210,7 +210,7 @@ let string_lex_double lexbuf strbuf = lex lexbuf strbuf | _ -> lexeme lexbuf - |> Format.asprintf "Unexpected character: %s" + |> Format.sprintf "Unexpected character: %s" |> Result.error in lex lexbuf strbuf @@ -218,8 +218,8 @@ let string_lex_double lexbuf strbuf = let string_lex lexbuf quote = let strbuf = Buffer.create 200 in if quote = "'" then string_lex_single lexbuf strbuf - else if quote = "\"" then string_lex_double lexbuf strbuf - else Error (Format.sprintf "invalid string quote %s" quote) + else if quote = {|"|} then string_lex_double lexbuf strbuf + else Error (Format.sprintf "Invalid string quote %S" quote) let rec lex tokens buf = let lexeme = Sedlexing.Utf8.lexeme in diff --git a/test_json5/test.ml b/test_json5/test.ml index 8999094e..d8a120f8 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -14,74 +14,69 @@ module M = struct | Error e -> raise (Yojson.Json_error e) end -let yojson_json5 = Alcotest.testable M.pp M.equal +let yojson = Alcotest.testable M.pp M.equal -let test_from_string () = - Alcotest.(check yojson_json5) "Empty object" (`Assoc []) (M.from_string "{}"); - Alcotest.(check yojson_json5) "Empty list" (`List []) (M.from_string "[]"); - Alcotest.(check yojson_json5) - "List" - (`List [ `Int 1; `String "2"; `Float 3. ]) - (M.from_string "[1, \"2\", 3.0]"); - Alcotest.(check yojson_json5) "true" (`Bool true) (M.from_string "true"); - Alcotest.(check yojson_json5) "false" (`Bool false) (M.from_string "false"); - Alcotest.(check yojson_json5) "null" `Null (M.from_string "null"); - Alcotest.(check yojson_json5) - "double quotes string" (`String "hello world") - (M.from_string {|"hello world"|}); - Alcotest.(check yojson_json5) - "single quotes string" (`String "hello world") - (M.from_string {|'hello world'|}); - Alcotest.(check yojson_json5) - "float" (`Float 12345.67890) - (M.from_string "12345.67890"); - Alcotest.(check yojson_json5) "hex" (`Int 0x1) (M.from_string "0x1"); - Alcotest.(check yojson_json5) - "hex escape sequence" (`String "a") (M.from_string {|"\x61"|}); - Alcotest.(check yojson_json5) - "unicode escape sequence" (`String "λ") - (M.from_string {|"\u03bb"|}); - Alcotest.(check yojson_json5) - "more string escaping" (`String "Hello λ world") - (M.from_string "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\""); - Alcotest.(check string) - "string unescaped linebreak fails" "Unexpected character: " - (M.from_string_err {|"foo -bar"|}); - Alcotest.(check yojson_json5) - "null byte string" (`String "\x00") (M.from_string {|"\0"|}); - Alcotest.(check yojson_json5) - "octal string" (`String "?") (M.from_string {|"\077"|}); - Alcotest.(check yojson_json5) - "null and octal string" (`String "\x007") (M.from_string {|"\07"|}); - Alcotest.(check yojson_json5) "int" (`Int 1) (M.from_string "1"); - Alcotest.(check yojson_json5) - "backslash escape" (`String {|foo\bar|}) - (M.from_string {|"foo\\bar"|}); - Alcotest.(check yojson_json5) - "line break" (`String "foobar") - (M.from_string "\"foo\\\nbar\""); - Alcotest.(check yojson_json5) - "string and comment" (`String "bar") - (M.from_string "\"bar\" //foo"); - let expected = - `Assoc - [ - ("unquoted", `String "and you can quote me on that"); - ("singleQuotes", `String "I can use \"double quotes\" here"); - ("lineBreaks", `String {|Look, Mom! No \n's!|}); - ("hexadecimal", `Int 0xdecaf); - ("leadingDecimalPoint", `Float 0.8675309); - ("andTrailing", `Float 8675309.0); - ("positiveSign", `Int 1); - ("trailingComma", `String "in objects"); - ("andIn", `List [ `String "arrays" ]); - ("backwardsCompatible", `String "with JSON"); - ] - in - Alcotest.(check yojson_json5) - "More elaborated" expected - (M.from_string +let parsing_test_case name expected input = + Alcotest.test_case name `Quick (fun () -> + Alcotest.check yojson name expected (M.from_string input)) + +let parsing_tests = + [ + Alcotest.test_case "Unexpected line break" `Quick (fun () -> + Alcotest.(check string) + "string unescaped linebreak fails" "Unexpected character: " + (M.from_string_err {|"foo + bar"|})); + parsing_test_case "Empty object" (`Assoc []) "{}"; + parsing_test_case "Empty list" (`List []) "[]"; + parsing_test_case "List" + (`List [ `Int 1; `String "2"; `Float 3. ]) + {|[1, "2", 3.0]|}; + parsing_test_case "true" (`Bool true) "true"; + parsing_test_case "false" (`Bool false) "false"; + parsing_test_case "null" `Null "null"; + parsing_test_case "double quotes string" (`String "hello world") + {|"hello world"|}; + parsing_test_case "single quotes string" (`String "hello world") + {|'hello world'|}; + parsing_test_case "float" (`Float 12345.67890) "12345.67890"; + parsing_test_case "hex" (`Int 0x1) "0x1"; + parsing_test_case "hex escape sequence" (`String "a") {|"\x61"|}; + parsing_test_case "unicode escape sequence" (`String "λ") {|"\u03bb"|}; + parsing_test_case "more string escaping" (`String "Hello λ world") + "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\""; + parsing_test_case "null byte string" (`String "\x00") {|"\0"|}; + parsing_test_case "octal string" (`String "?") {|"\077"|}; + parsing_test_case "null and octal string" (`String "\x007") {|"\07"|}; + parsing_test_case "int" (`Int 1) "1"; + parsing_test_case "backslash escape" (`String {|foo\bar|}) {|"foo\\bar"|}; + parsing_test_case "line break" (`String "foobar") "\"foo\\\nbar\""; + parsing_test_case "string and comment" (`String "bar") "\"bar\" //foo"; + parsing_test_case "object with double quote string" + (`Assoc [ ("foo", `String "bar") ]) + {|{"foo": "bar"}|}; + parsing_test_case "object with single quote string" + (`Assoc [ ("foo", `String "bar") ]) + {|{'foo': 'bar'}|}; + parsing_test_case "object with unquoted string" + (`Assoc [ ("foo", `String "bar") ]) + {|{foo: 'bar'}|}; + (let expected = + `Assoc + [ + ("unquoted", `String "and you can quote me on that"); + ("singleQuotes", `String "I can use \"double quotes\" here"); + ("lineBreaks", `String {|Look, Mom! No \n's!|}); + ("hexadecimal", `Int 0xdecaf); + ("leadingDecimalPoint", `Float 0.8675309); + ("andTrailing", `Float 8675309.0); + ("positiveSign", `Int 1); + ("trailingComma", `String "in objects"); + ("andIn", `List [ `String "arrays" ]); + ("backwardsCompatible", `String "with JSON"); + ] + in + parsing_test_case "More elaborated" expected {|{ // comments unquoted: 'and you can quote me on that', @@ -93,27 +88,26 @@ No \\n's!", positiveSign: +1, trailingComma: 'in objects', andIn: ['arrays',], "backwardsCompatible": "with JSON", -}|}) +}|}); + ] + +let writing_test_case name expected input = + Alcotest.test_case name `Quick (fun () -> + Alcotest.(check string) name expected (M.to_string input)) -let test_to_string () = - Alcotest.(check string) "Empty object" "{}" (M.to_string (`Assoc [])); - Alcotest.(check string) "Empty list" "[]" (M.to_string (`List [])); - Alcotest.(check string) "true" "true" (M.to_string (`Bool true)); - Alcotest.(check string) "false" "false" (M.to_string (`Bool false)); - Alcotest.(check string) "null" "null" (M.to_string `Null); - Alcotest.(check string) - "string" "\"hello world\"" - (M.to_string (`String "hello world")); - Alcotest.(check string) "float" "12345.6789" (M.to_string (`Float 12345.6789)); - Alcotest.(check string) "hex" "1" (M.to_string (`Int 0x1)); - Alcotest.(check string) "int" "1" (M.to_string (`Int 1)) +let writing_tests = + [ + writing_test_case "Empty object" "{}" (`Assoc []); + writing_test_case "Empty list" "[]" (`List []); + writing_test_case "true" "true" (`Bool true); + writing_test_case "false" "false" (`Bool false); + writing_test_case "null" "null" `Null; + writing_test_case "string" "\"hello world\"" (`String "hello world"); + writing_test_case "float" "12345.6789" (`Float 12345.6789); + writing_test_case "hex" "1" (`Int 0x1); + writing_test_case "int" "1" (`Int 1); + ] -(* Run it *) let () = - let open Alcotest in - run "JSON5" - [ - ( "from_string", - [ test_case "reading from string" `Quick test_from_string ] ); - ("to_string", [ test_case "write to string" `Quick test_to_string ]); - ] + Alcotest.run "JSON5" + [ ("parsing", parsing_tests); ("writing", writing_tests) ] From afcb73801f1570443fed0b17aaad3ce85cdbcd41 Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Fri, 26 Apr 2024 14:39:12 +0200 Subject: [PATCH 18/24] Fix issue with quoting The lexer should return a string, no need to add double or single quotes when lexing, they are not the content of the string. --- lib/json5/ast.ml | 18 ++++-------------- lib/json5/lexer.ml | 10 ++-------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/lib/json5/ast.ml b/lib/json5/ast.ml index 8864a8fc..a1f26d57 100644 --- a/lib/json5/ast.ml +++ b/lib/json5/ast.ml @@ -7,29 +7,19 @@ type internal = | Bool of bool | Null -let strip_quotes s = String.(sub s 1 (length s - 2)) - -let safe_strip_quotes s = - if String.(get s 0 = '"' && get s (length s - 1) = '"') then strip_quotes s - else s - let rec to_basic = function - | Assoc l -> - `Assoc - (List.map (fun (name, obj) -> (safe_strip_quotes name, to_basic obj)) l) + | Assoc l -> `Assoc (List.map (fun (name, obj) -> (name, to_basic obj)) l) | List l -> `List (List.map to_basic l) - | StringLit s -> `String (strip_quotes s) + | StringLit s -> `String s | FloatLit s -> `Float (float_of_string s) | IntLit s -> `Int (int_of_string s) | Bool b -> `Bool b | Null -> `Null let rec to_safe = function - | Assoc l -> - `Assoc - (List.map (fun (name, obj) -> (safe_strip_quotes name, to_safe obj)) l) + | Assoc l -> `Assoc (List.map (fun (name, obj) -> (name, to_safe obj)) l) | List l -> `List (List.map to_safe l) - | StringLit s -> `String (strip_quotes s) + | StringLit s -> `String s | FloatLit s -> `Float (float_of_string s) | IntLit s -> ( match int_of_string_opt s with Some i -> `Int i | None -> `Intlit s) diff --git a/lib/json5/lexer.ml b/lib/json5/lexer.ml index 66bf952b..12f66b0a 100644 --- a/lib/json5/lexer.ml +++ b/lib/json5/lexer.ml @@ -170,13 +170,10 @@ let white_space = [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs] let string_lex_single lexbuf strbuf = - Buffer.add_char strbuf '\''; let lexeme = Sedlexing.Utf8.lexeme in let rec lex lexbuf strbuf = match%sedlex lexbuf with - | '\'' -> - Buffer.add_char strbuf '\''; - Ok (Buffer.contents strbuf) + | '\'' -> Ok (Buffer.contents strbuf) | '\\', escape_sequence -> let* s = Unescape.unescape (lexeme lexbuf) in Buffer.add_string strbuf s; @@ -193,13 +190,10 @@ let string_lex_single lexbuf strbuf = lex lexbuf strbuf let string_lex_double lexbuf strbuf = - Buffer.add_char strbuf '"'; let lexeme = Sedlexing.Utf8.lexeme in let rec lex lexbuf strbuf = match%sedlex lexbuf with - | '"' -> - Buffer.add_char strbuf '"'; - Ok (Buffer.contents strbuf) + | '"' -> Ok (Buffer.contents strbuf) | '\\', escape_sequence -> let* s = Unescape.unescape (lexeme lexbuf) in Buffer.add_string strbuf s; From 6514db3ae50d050eee405b79e4e006f6d569b085 Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Fri, 26 Apr 2024 14:50:55 +0200 Subject: [PATCH 19/24] Simplify the code with `Result.map` operators --- lib/json5/ast.ml | 6 +++--- lib/json5/let_syntax.ml | 1 + lib/json5/parser.ml | 12 ++++++------ lib/json5/read.ml | 14 +++++++------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/lib/json5/ast.ml b/lib/json5/ast.ml index a1f26d57..669ed98b 100644 --- a/lib/json5/ast.ml +++ b/lib/json5/ast.ml @@ -1,6 +1,6 @@ -type internal = - | Assoc of (string * internal) list - | List of internal list +type t = + | Assoc of (string * t) list + | List of t list | StringLit of string | IntLit of string | FloatLit of string diff --git a/lib/json5/let_syntax.ml b/lib/json5/let_syntax.ml index 83931e28..7196494f 100644 --- a/lib/json5/let_syntax.ml +++ b/lib/json5/let_syntax.ml @@ -1,3 +1,4 @@ module Result = struct let ( let* ) = Result.bind + let ( let+ ) v f = Result.map f v end diff --git a/lib/json5/parser.ml b/lib/json5/parser.ml index 815c02e0..ec7bd8c0 100644 --- a/lib/json5/parser.ml +++ b/lib/json5/parser.ml @@ -50,11 +50,11 @@ and parse = function | INT_OR_FLOAT v -> Ok (FloatLit v, xs) | STRING s -> Ok (StringLit s, xs) | OPEN_BRACKET -> - let* l, xs = parse_list [] xs in - Ok (Ast.List (List.rev l), xs) + let+ l, xs = parse_list [] xs in + (Ast.List (List.rev l), xs) | OPEN_BRACE -> - let* a, xs = parse_assoc [] xs in - Ok (Ast.Assoc (List.rev a), xs) + let+ a, xs = parse_assoc [] xs in + (Ast.Assoc (List.rev a), xs) | x -> let s = Format.asprintf "Unexpected token: %a" Lexer.pp_token x in Error s) @@ -68,8 +68,8 @@ let parse_from_lexbuf ?fname ?lnum lexbuffer = in Sedlexing.set_position lexbuffer pos; let* tokens = Lexer.lex [] lexbuffer in - let* ast = parse tokens in - Ok (fst ast) + let+ ast, _unparsed = parse tokens in + ast let parse_from_string ?fname ?lnum input = parse_from_lexbuf (Sedlexing.Utf8.from_string input) ?fname ?lnum diff --git a/lib/json5/read.ml b/lib/json5/read.ml index 4f4cacbf..bd2742c7 100644 --- a/lib/json5/read.ml +++ b/lib/json5/read.ml @@ -3,7 +3,7 @@ open Let_syntax.Result module type S = sig type t - val convert : Ast.internal -> t + val convert : Ast.t -> t end module type Out = sig @@ -21,14 +21,14 @@ module Make (F : S) : Out with type t = F.t = struct type t = F.t let from_string ?fname ?lnum input = - let* ast = Parser.parse_from_string ?fname ?lnum input in - Ok (F.convert ast) + let+ ast = Parser.parse_from_string ?fname ?lnum input in + F.convert ast let from_channel ?fname ?lnum ic = - let* ast = Parser.parse_from_channel ?fname ?lnum ic in - Ok (F.convert ast) + let+ ast = Parser.parse_from_channel ?fname ?lnum ic in + F.convert ast let from_file ?fname ?lnum file = - let* ast = Parser.parse_from_file ?fname ?lnum file in - Ok (F.convert ast) + let+ ast = Parser.parse_from_file ?fname ?lnum file in + F.convert ast end From e9760797e4b16f2f199dbb3d6a678d5f22cae995 Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Fri, 26 Apr 2024 16:40:40 +0200 Subject: [PATCH 20/24] Add test cases for trailing commas --- test_json5/dune | 2 +- test_json5/test.ml | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test_json5/dune b/test_json5/dune index 46b63c12..21bf5685 100644 --- a/test_json5/dune +++ b/test_json5/dune @@ -1,4 +1,4 @@ (test (name test) (package yojson-json5) - (libraries alcotest yojson_json5)) + (libraries alcotest fmt yojson_json5)) diff --git a/test_json5/test.ml b/test_json5/test.ml index d8a120f8..6de12173 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -20,6 +20,15 @@ let parsing_test_case name expected input = Alcotest.test_case name `Quick (fun () -> Alcotest.check yojson name expected (M.from_string input)) +let parsing_should_fail name input = + Alcotest.test_case name `Quick (fun () -> + (* any error message will do *) + let any_string = Alcotest.testable Fmt.string (fun _ _ -> true) in + let expected = Error "" in + Alcotest.(check (result yojson any_string)) + name expected + (Yojson_json5.Safe.from_string input)) + let parsing_tests = [ Alcotest.test_case "Unexpected line break" `Quick (fun () -> @@ -61,6 +70,16 @@ let parsing_tests = parsing_test_case "object with unquoted string" (`Assoc [ ("foo", `String "bar") ]) {|{foo: 'bar'}|}; + parsing_test_case "trailing comma in list" + (`List [ `Int 1; `Int 2; `Int 3 ]) + "[1, 2, 3,]"; + parsing_should_fail "multiple trailing commas in list" "[1, 2, 3,]"; + parsing_should_fail "just trailing commas in list" "[,,,]"; + parsing_test_case "trailing comma in object" + (`Assoc [ ("one", `Int 1) ]) + {|{"one": 1,}|}; + parsing_should_fail "multiple trailing commas in object" {|{"one": 1,,}|}; + parsing_should_fail "just trailing commas in object" "{,,,}"; (let expected = `Assoc [ From fd07bd7755d0505e51f0427718df5c521a6fef2c Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Fri, 26 Apr 2024 16:58:06 +0200 Subject: [PATCH 21/24] Unify all tests to use `Alcotest.result` to compare --- test_json5/test.ml | 91 +++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 53 deletions(-) diff --git a/test_json5/test.ml b/test_json5/test.ml index 6de12173..a9be3e19 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -1,81 +1,66 @@ -module M = struct - include Yojson_json5.Safe - - let from_string_err s = - match from_string s with - | Ok x -> - failwith - (Format.sprintf "Test didn't failed when should: %s" (to_string x)) - | Error e -> e - - let from_string s = - match from_string s with - | Ok t -> t - | Error e -> raise (Yojson.Json_error e) -end +module M = Yojson_json5.Safe let yojson = Alcotest.testable M.pp M.equal let parsing_test_case name expected input = - Alcotest.test_case name `Quick (fun () -> - Alcotest.check yojson name expected (M.from_string input)) - -let parsing_should_fail name input = Alcotest.test_case name `Quick (fun () -> (* any error message will do *) let any_string = Alcotest.testable Fmt.string (fun _ _ -> true) in - let expected = Error "" in Alcotest.(check (result yojson any_string)) - name expected - (Yojson_json5.Safe.from_string input)) + name expected (M.from_string input)) + +let parsing_should_succeed name expected input = + parsing_test_case name (Ok expected) input + +let parsing_should_fail name input = + let failure = Error "" in + parsing_test_case name failure input let parsing_tests = [ - Alcotest.test_case "Unexpected line break" `Quick (fun () -> - Alcotest.(check string) - "string unescaped linebreak fails" "Unexpected character: " - (M.from_string_err {|"foo - bar"|})); - parsing_test_case "Empty object" (`Assoc []) "{}"; - parsing_test_case "Empty list" (`List []) "[]"; - parsing_test_case "List" + parsing_should_fail "Unexpected line break" {|"foo + bar"|}; + parsing_should_succeed "Empty object" (`Assoc []) "{}"; + parsing_should_succeed "Empty list" (`List []) "[]"; + parsing_should_succeed "List" (`List [ `Int 1; `String "2"; `Float 3. ]) {|[1, "2", 3.0]|}; - parsing_test_case "true" (`Bool true) "true"; - parsing_test_case "false" (`Bool false) "false"; - parsing_test_case "null" `Null "null"; - parsing_test_case "double quotes string" (`String "hello world") + parsing_should_succeed "true" (`Bool true) "true"; + parsing_should_succeed "false" (`Bool false) "false"; + parsing_should_succeed "null" `Null "null"; + parsing_should_succeed "double quotes string" (`String "hello world") {|"hello world"|}; - parsing_test_case "single quotes string" (`String "hello world") + parsing_should_succeed "single quotes string" (`String "hello world") {|'hello world'|}; - parsing_test_case "float" (`Float 12345.67890) "12345.67890"; - parsing_test_case "hex" (`Int 0x1) "0x1"; - parsing_test_case "hex escape sequence" (`String "a") {|"\x61"|}; - parsing_test_case "unicode escape sequence" (`String "λ") {|"\u03bb"|}; - parsing_test_case "more string escaping" (`String "Hello λ world") + parsing_should_succeed "float" (`Float 12345.67890) "12345.67890"; + parsing_should_succeed "hex" (`Int 0x1) "0x1"; + parsing_should_succeed "hex escape sequence" (`String "a") {|"\x61"|}; + parsing_should_succeed "unicode escape sequence" (`String "λ") {|"\u03bb"|}; + parsing_should_succeed "more string escaping" (`String "Hello λ world") "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\""; - parsing_test_case "null byte string" (`String "\x00") {|"\0"|}; - parsing_test_case "octal string" (`String "?") {|"\077"|}; - parsing_test_case "null and octal string" (`String "\x007") {|"\07"|}; - parsing_test_case "int" (`Int 1) "1"; - parsing_test_case "backslash escape" (`String {|foo\bar|}) {|"foo\\bar"|}; - parsing_test_case "line break" (`String "foobar") "\"foo\\\nbar\""; - parsing_test_case "string and comment" (`String "bar") "\"bar\" //foo"; - parsing_test_case "object with double quote string" + parsing_should_succeed "null byte string" (`String "\x00") {|"\0"|}; + parsing_should_succeed "octal string" (`String "?") {|"\077"|}; + parsing_should_succeed "null and octal string" (`String "\x007") {|"\07"|}; + parsing_should_succeed "int" (`Int 1) "1"; + parsing_should_succeed "backslash escape" (`String {|foo\bar|}) + {|"foo\\bar"|}; + parsing_should_succeed "line break" (`String "foobar") "\"foo\\\nbar\""; + parsing_should_succeed "string and comment" (`String "bar") "\"bar\" //foo"; + parsing_should_succeed "object with double quote string" (`Assoc [ ("foo", `String "bar") ]) {|{"foo": "bar"}|}; - parsing_test_case "object with single quote string" + parsing_should_succeed "object with single quote string" (`Assoc [ ("foo", `String "bar") ]) {|{'foo': 'bar'}|}; - parsing_test_case "object with unquoted string" + parsing_should_succeed "object with unquoted string" (`Assoc [ ("foo", `String "bar") ]) {|{foo: 'bar'}|}; - parsing_test_case "trailing comma in list" + parsing_should_succeed "trailing comma in list" (`List [ `Int 1; `Int 2; `Int 3 ]) "[1, 2, 3,]"; parsing_should_fail "multiple trailing commas in list" "[1, 2, 3,]"; parsing_should_fail "just trailing commas in list" "[,,,]"; - parsing_test_case "trailing comma in object" + parsing_should_succeed "trailing comma in object" (`Assoc [ ("one", `Int 1) ]) {|{"one": 1,}|}; parsing_should_fail "multiple trailing commas in object" {|{"one": 1,,}|}; @@ -95,7 +80,7 @@ let parsing_tests = ("backwardsCompatible", `String "with JSON"); ] in - parsing_test_case "More elaborated" expected + parsing_should_succeed "More elaborated" expected {|{ // comments unquoted: 'and you can quote me on that', From dbe0b81659f8f09f686d2a84ec07c14dabb649ac Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Thu, 30 May 2024 11:10:40 +0200 Subject: [PATCH 22/24] Fix handling of multiple trailing commas --- lib/json5/parser.ml | 4 ++-- test_json5/test.ml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/json5/parser.ml b/lib/json5/parser.ml index ec7bd8c0..1f19c825 100644 --- a/lib/json5/parser.ml +++ b/lib/json5/parser.ml @@ -2,7 +2,7 @@ open Let_syntax.Result let rec parse_list acc = function | [] -> Error "List never ends" - | Lexer.CLOSE_BRACKET :: xs | COMMA :: CLOSE_BRACKET :: xs -> Ok (acc, xs) + | Lexer.CLOSE_BRACKET :: xs -> Ok (acc, xs) | xs -> ( let* v, xs = parse xs in match xs with @@ -18,7 +18,7 @@ let rec parse_list acc = function and parse_assoc acc = function | [] -> Error "Assoc never ends" - | Lexer.CLOSE_BRACE :: xs | COMMA :: CLOSE_BRACE :: xs -> Ok (acc, xs) + | Lexer.CLOSE_BRACE :: xs -> Ok (acc, xs) | STRING k :: COLON :: xs | IDENTIFIER_NAME k :: COLON :: xs -> ( let* v, xs = parse xs in let item = (k, v) in diff --git a/test_json5/test.ml b/test_json5/test.ml index a9be3e19..98a7f856 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -58,7 +58,7 @@ let parsing_tests = parsing_should_succeed "trailing comma in list" (`List [ `Int 1; `Int 2; `Int 3 ]) "[1, 2, 3,]"; - parsing_should_fail "multiple trailing commas in list" "[1, 2, 3,]"; + parsing_should_fail "multiple trailing commas in list" "[1, 2, 3,,]"; parsing_should_fail "just trailing commas in list" "[,,,]"; parsing_should_succeed "trailing comma in object" (`Assoc [ ("one", `Int 1) ]) From 5a81430bbb9efc3cbe27c65ee955a0a8c44e85eb Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Thu, 30 May 2024 11:42:00 +0200 Subject: [PATCH 23/24] More tests and better argument order --- test_json5/test.ml | 120 ++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 56 deletions(-) diff --git a/test_json5/test.ml b/test_json5/test.ml index 98a7f856..f27a23ff 100644 --- a/test_json5/test.ml +++ b/test_json5/test.ml @@ -9,7 +9,7 @@ let parsing_test_case name expected input = Alcotest.(check (result yojson any_string)) name expected (M.from_string input)) -let parsing_should_succeed name expected input = +let parsing_should_succeed name input expected = parsing_test_case name (Ok expected) input let parsing_should_fail name input = @@ -20,51 +20,58 @@ let parsing_tests = [ parsing_should_fail "Unexpected line break" {|"foo bar"|}; - parsing_should_succeed "Empty object" (`Assoc []) "{}"; - parsing_should_succeed "Empty list" (`List []) "[]"; - parsing_should_succeed "List" - (`List [ `Int 1; `String "2"; `Float 3. ]) - {|[1, "2", 3.0]|}; - parsing_should_succeed "true" (`Bool true) "true"; - parsing_should_succeed "false" (`Bool false) "false"; - parsing_should_succeed "null" `Null "null"; - parsing_should_succeed "double quotes string" (`String "hello world") - {|"hello world"|}; - parsing_should_succeed "single quotes string" (`String "hello world") - {|'hello world'|}; - parsing_should_succeed "float" (`Float 12345.67890) "12345.67890"; - parsing_should_succeed "hex" (`Int 0x1) "0x1"; - parsing_should_succeed "hex escape sequence" (`String "a") {|"\x61"|}; - parsing_should_succeed "unicode escape sequence" (`String "λ") {|"\u03bb"|}; - parsing_should_succeed "more string escaping" (`String "Hello λ world") - "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\""; - parsing_should_succeed "null byte string" (`String "\x00") {|"\0"|}; - parsing_should_succeed "octal string" (`String "?") {|"\077"|}; - parsing_should_succeed "null and octal string" (`String "\x007") {|"\07"|}; - parsing_should_succeed "int" (`Int 1) "1"; - parsing_should_succeed "backslash escape" (`String {|foo\bar|}) - {|"foo\\bar"|}; - parsing_should_succeed "line break" (`String "foobar") "\"foo\\\nbar\""; - parsing_should_succeed "string and comment" (`String "bar") "\"bar\" //foo"; - parsing_should_succeed "object with double quote string" - (`Assoc [ ("foo", `String "bar") ]) - {|{"foo": "bar"}|}; - parsing_should_succeed "object with single quote string" - (`Assoc [ ("foo", `String "bar") ]) - {|{'foo': 'bar'}|}; - parsing_should_succeed "object with unquoted string" - (`Assoc [ ("foo", `String "bar") ]) - {|{foo: 'bar'}|}; - parsing_should_succeed "trailing comma in list" - (`List [ `Int 1; `Int 2; `Int 3 ]) - "[1, 2, 3,]"; - parsing_should_fail "multiple trailing commas in list" "[1, 2, 3,,]"; - parsing_should_fail "just trailing commas in list" "[,,,]"; - parsing_should_succeed "trailing comma in object" - (`Assoc [ ("one", `Int 1) ]) - {|{"one": 1,}|}; + parsing_should_succeed "true" "true" (`Bool true); + parsing_should_succeed "false" "false" (`Bool false); + parsing_should_succeed "null" "null" `Null; + parsing_should_succeed "double quotes string" {|"hello world"|} + (`String "hello world"); + parsing_should_succeed "single quotes string" {|'hello world'|} + (`String "hello world"); + parsing_should_succeed "float" "12345.67890" (`Float 12345.67890); + parsing_should_succeed "hex" "0x1" (`Int 0x1); + parsing_should_succeed "hex escape sequence" {|"\x61"|} (`String "a"); + parsing_should_succeed "unicode escape sequence" {|"\u03bb"|} (`String "λ"); + parsing_should_succeed "more string escaping" + "\"Hello \\u03bb \\x77\\x6F\\x72\\x6C\\x64\"" (`String "Hello λ world"); + parsing_should_succeed "null byte string" {|"\0"|} (`String "\x00"); + parsing_should_succeed "octal string" {|"\077"|} (`String "?"); + parsing_should_succeed "null and octal string" {|"\07"|} (`String "\x007"); + parsing_should_succeed "int" "1" (`Int 1); + parsing_should_succeed "backslash escape" {|"foo\\bar"|} + (`String {|foo\bar|}); + parsing_should_succeed "line break" "\"foo\\\nbar\"" (`String "foobar"); + parsing_should_succeed "string and comment" "\"bar\" //foo" (`String "bar"); + (* objects *) + parsing_should_succeed "empty object" "{}" (`Assoc []); + parsing_should_succeed "object with double quote string" {|{"foo": "bar"}|} + (`Assoc [ ("foo", `String "bar") ]); + parsing_should_succeed "object with single quote string" {|{'foo': 'bar'}|} + (`Assoc [ ("foo", `String "bar") ]); + parsing_should_succeed "object with unquoted string" {|{foo: 'bar'}|} + (`Assoc [ ("foo", `String "bar") ]); + parsing_should_succeed "trailing comma in object" {|{"one": 1,}|} + (`Assoc [ ("one", `Int 1) ]); + parsing_should_succeed "colon in key" {|{"colon:": 1}|} + (`Assoc [ ("colon:", `Int 1) ]); parsing_should_fail "multiple trailing commas in object" {|{"one": 1,,}|}; + parsing_should_fail "just trailing comma in object" "{,}"; parsing_should_fail "just trailing commas in object" "{,,,}"; + parsing_should_fail "multiple colons in object" {|{one :: 1}|}; + parsing_should_fail "newline in key" {|{new\nline: 1}|}; + (* lists *) + parsing_should_succeed "empty list" "[]" (`List []); + parsing_should_succeed "heterogenous list" {|[1, "2", 3.0]|} + (`List [ `Int 1; `String "2"; `Float 3. ]); + parsing_should_succeed "trailing comma in list" "[1, 2, 3,]" + (`List [ `Int 1; `Int 2; `Int 3 ]); + parsing_should_succeed "trailing comma with space list" "[1, 2, 3, ]" + (`List [ `Int 1; `Int 2; `Int 3 ]); + parsing_should_succeed "newlines in list" "[1, 2\n, 3]" + (`List [ `Int 1; `Int 2; `Int 3 ]); + parsing_should_fail "multiple trailing commas in list" "[1, 2, 3,,]"; + parsing_should_fail "just trailing comma in list" "[,]"; + parsing_should_fail "multiple trailing commas in list" "[,,,]"; + (* all together *) (let expected = `Assoc [ @@ -80,7 +87,7 @@ let parsing_tests = ("backwardsCompatible", `String "with JSON"); ] in - parsing_should_succeed "More elaborated" expected + parsing_should_succeed "More elaborated" {|{ // comments unquoted: 'and you can quote me on that', @@ -92,24 +99,25 @@ No \\n's!", positiveSign: +1, trailingComma: 'in objects', andIn: ['arrays',], "backwardsCompatible": "with JSON", -}|}); +}|} + expected); ] -let writing_test_case name expected input = +let writing_test_case name input expected = Alcotest.test_case name `Quick (fun () -> Alcotest.(check string) name expected (M.to_string input)) let writing_tests = [ - writing_test_case "Empty object" "{}" (`Assoc []); - writing_test_case "Empty list" "[]" (`List []); - writing_test_case "true" "true" (`Bool true); - writing_test_case "false" "false" (`Bool false); - writing_test_case "null" "null" `Null; - writing_test_case "string" "\"hello world\"" (`String "hello world"); - writing_test_case "float" "12345.6789" (`Float 12345.6789); - writing_test_case "hex" "1" (`Int 0x1); - writing_test_case "int" "1" (`Int 1); + writing_test_case "Empty object" (`Assoc []) "{}"; + writing_test_case "Empty list" (`List []) "[]"; + writing_test_case "true" (`Bool true) "true"; + writing_test_case "false" (`Bool false) "false"; + writing_test_case "null" `Null "null"; + writing_test_case "string" (`String "hello world") "\"hello world\""; + writing_test_case "float" (`Float 12345.6789) "12345.6789"; + writing_test_case "hex" (`Int 0x1) "1"; + writing_test_case "int" (`Int 1) "1"; ] let () = From 1df1df5483a8cc25247d53215de8cf7d686f8824 Mon Sep 17 00:00:00 2001 From: Marek Kubica Date: Thu, 30 May 2024 11:56:50 +0200 Subject: [PATCH 24/24] Do not depend on `fmt` --- test_json5/dune | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_json5/dune b/test_json5/dune index 21bf5685..46b63c12 100644 --- a/test_json5/dune +++ b/test_json5/dune @@ -1,4 +1,4 @@ (test (name test) (package yojson-json5) - (libraries alcotest fmt yojson_json5)) + (libraries alcotest yojson_json5))