From 305b3fcae6d8dfb69ad16820844596b155a3ad9c Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Tue, 3 Apr 2018 17:03:24 +0100 Subject: [PATCH 1/2] SAX Tests for encoding and BOMs Try reading two files with Byte Order Marks. test_sax_fsm_1_utf8_bom.in is UTF8 encoded (but this is not mentioned in the XML header). We should be able to read this as long as we don't trip up with the BOM. test_sax_fsm_1_utf16_bom.in is UTF16 encoded with a BOM and encoding declared in the XML. We should not be able to read this (we should get a non-well-formed error). --- sax/test/test_sax_fsm_1_utf16_bom.in | Bin 0 -> 124 bytes sax/test/test_sax_fsm_1_utf8_bom.in | 2 ++ sax/test/test_sax_parser_6.f90 | 15 +++++++++++++++ sax/test/test_sax_parser_6.out | 1 + sax/test/test_sax_parser_7.f90 | 15 +++++++++++++++ sax/test/test_sax_parser_7.out | 3 +++ 6 files changed, 36 insertions(+) create mode 100644 sax/test/test_sax_fsm_1_utf16_bom.in create mode 100644 sax/test/test_sax_fsm_1_utf8_bom.in create mode 100644 sax/test/test_sax_parser_6.f90 create mode 100644 sax/test/test_sax_parser_6.out create mode 100644 sax/test/test_sax_parser_7.f90 create mode 100644 sax/test/test_sax_parser_7.out diff --git a/sax/test/test_sax_fsm_1_utf16_bom.in b/sax/test/test_sax_fsm_1_utf16_bom.in new file mode 100644 index 0000000000000000000000000000000000000000..f2b770618810f6336d00c720a74393d82556d3f1 GIT binary patch literal 124 zcmW-ZNeX~K3 + diff --git a/sax/test/test_sax_parser_6.f90 b/sax/test/test_sax_parser_6.f90 new file mode 100644 index 00000000..f14b8ad4 --- /dev/null +++ b/sax/test/test_sax_parser_6.f90 @@ -0,0 +1,15 @@ +program test_sax_reader + + use FoX_sax + type(xml_t) :: xp + integer :: iostat + + call open_xml_file(xp, "test_sax_fsm_1_utf8_bom.in", iostat) + + write(*,'(i1)') iostat + + call parse(xp) + + call close_xml_t(xp) + +end program test_sax_reader diff --git a/sax/test/test_sax_parser_6.out b/sax/test/test_sax_parser_6.out new file mode 100644 index 00000000..573541ac --- 
/dev/null +++ b/sax/test/test_sax_parser_6.out @@ -0,0 +1 @@ +0 diff --git a/sax/test/test_sax_parser_7.f90 b/sax/test/test_sax_parser_7.f90 new file mode 100644 index 00000000..9f181a7a --- /dev/null +++ b/sax/test/test_sax_parser_7.f90 @@ -0,0 +1,15 @@ +program test_sax_reader + + use FoX_sax + type(xml_t) :: xp + integer :: iostat + + call open_xml_file(xp, "test_sax_fsm_1_utf16_bom.in", iostat) + + write(*,'(i1)') iostat + + call parse(xp) + + call close_xml_t(xp) + +end program test_sax_reader diff --git a/sax/test/test_sax_parser_7.out b/sax/test/test_sax_parser_7.out new file mode 100644 index 00000000..2ccae3f2 --- /dev/null +++ b/sax/test/test_sax_parser_7.out @@ -0,0 +1,3 @@ +0 +ERROR(FoX) +Illegal character found at test_sax_fsm_1_utf16_bom.in:0:3 Error parsing XML declaration (Possibly near line=0 col=3) From 9d8e37e805065cc04dabf63841491d0aba1cff74 Mon Sep 17 00:00:00 2001 From: Antoine Langenfeld Date: Sun, 8 Jan 2017 09:42:04 +0000 Subject: [PATCH 2/2] Don't fail if BOM is present For a UTF8-encoded XML file with a Byte Order Mark and characters that are also ascii characters, we should be able to read the file. If the first character is not recognisable assume we are dealing with a BOM, skip it, and carry on. We'll then either read the file OK or we end up with something that is not well-formed (e.g. because it is a different encoding). --- sax/m_sax_xml_source.F90 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sax/m_sax_xml_source.F90 b/sax/m_sax_xml_source.F90 index 21c238e6..5b668a80 100644 --- a/sax/m_sax_xml_source.F90 +++ b/sax/m_sax_xml_source.F90 @@ -205,6 +205,11 @@ subroutine parse_declaration(f, eof, es, standalone) ch => null() do c = get_char_from_file(f, XML1_0, eof, es) + ! If we don't have a sensible character it may be a Byte + ! Order Mark. Skip it and carry on to see if we can still + ! read. + if (parse_state == XD_0 .and. c /= '<') & + cycle if (eof) then call rewind_source(f) exit