/******************************************************************************
 * This file is part of The Unicode Tools Of Rexx (TUTOR)                     *
 * See https://rexx.epbcn.com/TUTOR/                                          *
 *     and https://github.com/JosepMariaBlasco/TUTOR                          *
 * Copyright © 2023-2025 Josep Maria Blasco                                   *
 * License: Apache License 2.0 (https://www.apache.org/licenses/LICENSE-2.0)  *
 ******************************************************************************/

-- Please refer to docs/new-functions.md for documentation and additional details.
--
-- Version 0.4b, 20230925
--
-- Notice:
-- -------
--
-- Although this routine is part of TUTOR, The Unicode Tools Of Rexx,
-- it can also be used separately, as it has no dependencies on the rest
-- of components of TUTOR.
--

--------------------------------------------------------------------------------
-- UTF8: validate a byte string against a UTF-8 family encoding and,          --
--       optionally, convert it to UTF-8/UTF-32 or WTF-8/WTF-32               --
--------------------------------------------------------------------------------
--
-- Arguments:
--
--   string        -- The string to validate/convert
--   format        -- Format of "string" (one of UTF-8, UTF-8Z, WTF-8, CESU-8,
--                 -- MUTF-8; default: UTF-8). The hyphen is optional.
--   target        -- We want "string" converted to the "target" encodings.
--                 --   Either not specified or "" (the default), don't do
--                 --   any conversion (validation mode);
--                 --   or (one of UTF-8, UTF-32, or both),
--                 --   or (one of WTF-8, WTF-32, or both).
--                 --   A blank-only target is treated like "".
--   errorHandling -- What to do if we find ill-formed character sequences.
--                 --   One of ("NULL" or "" [the default], return a null
--                 --   string; "REPLACE", replace the ill-formed sequence
--                 --   with the Unicode Replacement Character, or "SYNTAX",
--                 --   raise a Syntax error).
--                 --   It is an error to specify "errorHandling" when
--                 --   "target" was not specified.
--
-- Returns:
--
--   * Validation mode (no target): 1 when "string" is well-formed, 0 otherwise.
--   * One distinct kind of target: the converted string.
--   * Both an 8-bit and a 32-bit target: a stem with tails UTF8 and UTF32
--     (or WTF8 and WTF32 when lone surrogates are allowed).
--   * With errorHandling "NULL", an ill-formed "string" yields "".
--
--   Every string result goes through the internal String routine, which wraps
--   it in .Bytes when that class is available (see InitializeTranslateTables).
--
-- Raises:
--
--   Syntax 93.900 for an invalid format, target or errorHandling, and for
--   conflicting format/target combinations.
--   Syntax 23.900 for an ill-formed sequence when errorHandling is "SYNTAX".
--

::Routine UTF8 Public

  Use Strict Arg string, format = "UTF-8", target = "", errorHandling = ""

  -- One-time initialization. Fill the TRANSLATE tables
  If .Unicode.UTF8.Initialized \== 1 Then Call InitializeTranslateTables

  ------------------------------------------------------------------------------
  -- Check that the supplied arguments are OK,                                --
  --   and raise a Syntax condition if not                                    --
  ------------------------------------------------------------------------------

  -- Validate and normalize "format"
  format = Strip(Upper(format))
  Select Case format
    When "CESU-8", "CESU8"    Then format = "CESU-8"
    When "MUTF-8", "MUTF8"    Then format = "MUTF-8"
    When "UTF-8",  "UTF8", "" Then format = "UTF-8"
    When "UTF-8Z", "UTF8Z"    Then format = "UTF-8Z"
    When "WTF-8",  "WTF8"     Then format = "WTF-8"
    Otherwise Raise Syntax 93.900 Additional("Invalid format '"format"'")
  End

  -- Lone surrogates are ill-formed for UTF-8 and UTF-8Z.
  -- For these two formats we store -1, meaning "not decided yet":
  -- we have to wait to see what the targets are.
  If format \== "UTF-8", format \== "UTF-8Z" Then allowLoneSurrogates =  1
  Else                                             allowLoneSurrogates = -1

  -- Check that "target" is OK
  return8  = 0                          -- Return UTF-8  or WTF-8
  return32 = 0                          -- Return UTF-32 or WTF-32
  target   = Space(Upper(target))       -- Normalized; "" when only blanks were given

  -- A separate control variable is used so that "target" keeps its
  -- normalized value after the loop.
  Do thisTarget Over target~makeArray(" ")
    Select Case thisTarget
      When "UTF-8",  "UTF8",  "WTF-8",  "WTF8"  Then return8  = 1
      When "UTF-32", "UTF32", "WTF-32", "WTF32" Then return32 = 1
      Otherwise Raise Syntax 93.900 Additional("Invalid target '"thisTarget"'")
    End
    Select Case thisTarget
      When "UTF-8", "UTF8", "UTF-32", "UTF32" Then
        If allowLoneSurrogates == -1 Then allowLoneSurrogates = 0
        Else If allowLoneSurrogates = 1 Then
          Raise Syntax 93.900 Additional("Conflicting target '"target"' and format '"format"'")
      When "WTF-8", "WTF8", "WTF-32", "WTF32" Then
        If allowLoneSurrogates == -1 Then allowLoneSurrogates = 1
        Else If allowLoneSurrogates = 0 Then
          Raise Syntax 93.900 Additional("Conflicting target '"target"' and format '"format"'")
    End
  End

  -- Number of distinct outputs requested (0, 1 or 2). Counting outputs, not
  -- tokens, ensures that a repeated target like "UTF-8 UTF8" still returns
  -- a plain string instead of a stem with an unset 32-bit entry.
  targets  = return8 + return32

  -- We are in validation mode only when no target format has been specified.
  -- This is decided on the normalized target, so that a blank-only target
  -- is handled like a null one.
  validate = targets == 0

  -- It's an error to specify errorHandling when target was not specified
  If validate, errorHandling \== "" Then
    Raise Syntax 93.900 Additional("Invalid option '"errorHandling"'")

  -- Check that "errorHandling" is OK.
  Select Case Strip(Upper(errorHandling))
    When "NULL", ""  Then errorHandling = "NULL"
    When "SYNTAX"    Then errorHandling = "SYNTAX"
    When "REPLACE"   Then errorHandling = "REPLACE"
    Otherwise Raise Syntax 93.900 Additional("Invalid error handling '"errorHandling"'")
  End

  -- "REPLACE" is what the error handlers do when neither flag is set
  syntax = errorHandling == "SYNTAX"
  null   = errorHandling == "NULL"

  ------------------------------------------------------------------------------

  -- Null strings decode to the null string.
  If string == "" Then Signal StringIsEmpty

  string = string~makeString -- Demote to pure .String to avoid possible loops

  -- Build the "states" string. We will work in parallel with "string" and "states":
  -- states[i] is the class of byte string[i] ("A" for ASCII, "C" for continuation,
  -- "I" for illegal, or a lead-byte class that is also the label to jump to).
  states = Translate(string, .local["Unicode."format".tableo"], .local["Unicode."format".tablei"])

  If return8  Then buffer8  = .MutableBuffer~new
  If return32 Then buffer32 = .MutableBuffer~new

  i      = 0
  length = Length(string)

ContinueScan:

  i += 1

  -- Did we scan the whole string without errors? The string is good.
  If i > length Then Signal StringIsGood

  --
  -- VALID 1-BYTE SEQUENCES (i.e., ASCII, or ASCII - "00"U)
  --

  save = i
  If OnlyASCIIsLeft() Then Do
    -- If the rest of the string is composed only of ASCII characters, then
    -- the string is good. Copy all ASCIIs left, and return.
    Do i = i to length
      If return8  Then buffer8 ~append(            string[i] )
      If return32 Then buffer32~append( "000000"x, string[i] )
    End
    Signal StringIsGood
  End

  -- Only some (>= 0) chars were ASCII. Copy them.
  -- (OnlyASCIIsLeft has moved "i" to the first non-ASCII position.)
  Do j = save to i - 1
    If return8  Then buffer8 ~append(            string[j] )
    If return32 Then buffer32~append( "000000"x, string[j] )
  End

  --
  -- HANDLE NON-ASCII CHARACTERS
  --

  state = states[i]

  Signal (state)                        -- Fire the FSM

  --
  -- ILLEGAL CHARS
  --
  -- An Illegal character is an error,
  -- and a lone Continuation character is also an error.

"I": "C":
  Signal 1Error

  --
  -- TWO-BYTE SEQUENCES
  --

"20"X:                                  -- We want a two-byte sequence
  If states[i+1] \== "C" Then Signal 1Error
  If return32 Then Do
    y = Right(X2B(C2X(string[i  ])), 5)
    x = Right(X2B(C2X(string[i+1])), 6)
    buffer32~append( "0000"X , X2C(B2X("00000"||y||x)) )
  End
  If return8 Then buffer8~append(string[i,2])
  i += 1
  Signal ContinueScan

"0":                                    -- UTF8-Z and MUTF-8: C080 --> "0000"U, ill-formed otherwise
  If string[i+1] \== "80"X Then Signal 1Error
  If return8  Then buffer8 ~append( "00"X )
  If return32 Then buffer32~append( "0000 0000"X )
  i += 1
  Signal ContinueScan

  --
  -- THREE-BYTE SEQUENCES
  --

"3a"X: "3b"X: "3c"X: "3d"X: "3e"X:      -- We need exactly a three-byte sequence
  If states[i+1] \== "C" Then Signal 1Error
  If state ==  "3a"X, string[i+1] < "A0"X Then Signal 1Error
  If state ==  "3c"X, string[i+1] > "9F"X Then Signal 1Error
  -- Subtype "e" performs its own check of the third byte under CESU8
  If state \== "3e"X, states[i+2] \== "C" Then Signal 2Error
  If state ==  "3d"X Then Do
    -- In WTF-8, surrogate pairs are ill-formed
    If states[i+3,3] == "3d"X"CC", -
      string[i+1] >= "A0"X, string[i+1] <= "AF"X, -
      string[i+4] >= "B0"X, string[i+4] <= "BF"X Then Signal 6Error
    -- Lone surrogates aren't well-formed in some encodings
    If string[i+1] >= "A0"X, \ allowLoneSurrogates Then Signal 3Error
  End
  If state ==  "3e"X Then Signal CESU8

3OK:
  If return32 Then Do
    z = Right(X2B(C2X(string[i  ])), 4)
    y = Right(X2B(C2X(string[i+1])), 6)
    x = Right(X2B(C2X(string[i+2])), 6)
    buffer32~append( "0000"X, X2C(B2X(z||y||x)) )
  End
  If return8 Then buffer8~append(string[i,3])
  i = i + 2
  Signal ContinueScan

CESU8:
  -- See https://en.wikipedia.org/wiki/CESU-8:
  --
  -- "Though not specified in the technical report, unpaired surrogates are also encoded as 3 bytes each,
  --  and CESU-8 is exactly the same as applying an older UCS-2 to UTF-8 converter to UTF-16 data."
  --
  -- We know that string[i] == "ED"X and that states[i+1] == "C"
  If states[i+2] \== "C" Then Signal 2Error
  Select
    When string[i+1] <= "9F"X Then Signal 3OK -- Standard UTF-8 three-byte sequence
    When string[i+1] >= "B0"X Then Signal 3OK -- Lone trail surrogate
    Otherwise                                 -- A0..AF: lead surrogate
      If string[i+3] \== "ED"X Then Signal 3OK -- Lone lead surrogate
      If states[i+4] \== "C"   Then Signal 3OK
      If states[i+5] \== "C"   Then Signal 3OK
      If string[i+4] <  "B0"X  Then Signal 3OK -- Not a trail surrogate
      If string[i+4] >  "BF"X  Then Signal 3OK -- Not a trail surrogate
      -- From https://en.wikipedia.org/wiki/CESU-8
      -- The encoding of Unicode non-BMP characters works out to
      -- 11101101 1010yyyy 10xxxxxx 11101101 1011xxxx 10xxxxxx
      -- (yyyy holds the plane bits of the character minus one; this is why
      -- 1 is added back below).
      a = Right(X2B(C2X(string[i+1])), 4)
      b = Right(X2B(C2X(string[i+2])), 6)
      c = Right(X2B(C2X(string[i+4])), 4)
      d = Right(X2B(C2X(string[i+5])), 6)
      a = X2B(D2X( X2D(B2X(a)) + 1 ))         -- 4 bits, or 8 bits when the sum is 16
      b = Right(a || b || c || d, 21, 0)      -- The 21-bit scalar value, in binary
      scalar = Right(X2C(B2X(b)),4,"00"X)
  End
  If return32 Then buffer32~append( scalar )
  If return8 Then Do
    -- Unicode Standard, table 3-6.
    -- 000uuuuu zzzzyyyy yyxxxxxx --> 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
    Parse var b uuu 4 uuzzzz 10 yyyyyy 16 xxxxxx
    buffer8~append( X2C( B2X("11110"uuu"10"uuzzzz"10"yyyyyy"10"xxxxxx) ) )
  End
  i = i + 5
  Signal ContinueScan

  --
  -- FOUR-BYTE SEQUENCES
  --

"4a"X: "4b"X: "4c"X:
  -- We need a four-byte sequence. No continuation, or only one or two
  -- continuations are errors.
  If states[i+1] \== "C" Then Signal 1Error
  Select Case state
    When "4b"X Then Nop
    When "4a"X Then If string[i+1] < "90"X Then Signal 1Error
    When "4c"X Then If string[i+1] > "8F"X Then Signal 1Error
  End
  If states[i+2] \== "C" Then Signal 2Error
  If states[i+3] \== "C" Then Signal 3Error
  If return32 Then Do
    u = Right(X2B(C2X(string[i  ])), 3)
    z = Right(X2B(C2X(string[i+1])), 6)
    y = Right(X2B(C2X(string[i+2])), 6)
    x = Right(X2B(C2X(string[i+3])), 6)
    buffer32~append( "00"X, X2C(B2X(Right(u||z||y||x,24,0))) )
  End
  If return8 Then buffer8~append(string[i,4])
  i = i + 3
  Signal ContinueScan

  --
  -- ERROR HANDLERS
  --
  -- Each handler returns 0 in validation mode and "" in "NULL" mode, raises
  -- in "SYNTAX" mode, and otherwise emits the replacement character, skips
  -- the offending bytes and resumes the scan.
  --

-- 1 character is in error
1Error:
  If validate Then Return String(0)
  If null     Then Return String("")
  errorSequence = string[i]
  If syntax   Then Signal Syntax
  Call ReplaceCharacter
  Signal ContinueScan

-- 2 characters are in error
2Error:
  If validate Then Return String(0)
  If null     Then Return String("")
  errorSequence = string[i,2]
  If syntax   Then Signal Syntax
  Call ReplaceCharacter
  i += 1
  Signal ContinueScan

-- 3 characters are in error
3Error:
  If validate Then Return String(0)
  If null     Then Return String("")
  errorSequence = string[i,3]
  If syntax   Then Signal Syntax
  Call ReplaceCharacter
  i += 2
  Signal ContinueScan

-- 6 characters are in error (WTF-8 surrogate pair sequence)
6Error:
  If validate Then Return String(0)
  If null     Then Return String("")
  errorSequence = string[i,6]
  If syntax   Then Signal Syntax
  Call ReplaceCharacter
  Call ReplaceCharacter                 -- Two replacement characters: it was a pair
  i += 5
  Signal ContinueScan

-- Append U+FFFD to every requested output buffer
ReplaceCharacter:
  If return8  Then buffer8 ~append(   "efbfbd"X )
  If return32 Then buffer32~append( "0000FFFD"X )
Return

-- Returns 1 when every byte from position "i" onwards is ASCII. Otherwise
-- it moves "i" to the first non-ASCII position and returns 0.
OnlyASCIIsLeft:
  pos = Verify(states, "A", "N", i)
  If pos == 0 Then Return 1             -- Only ASCII characters left, string is good.
  i = pos                               -- Set the new i
Return 0

StringIsEmpty:
  If validate Then Return String(1)
  null = String("")
  -- Only one target? Return a string
  If targets == 1 Then Return null
  -- Several targets. Return a stem.
  s. = .Stem~new()
  If allowLoneSurrogates Then Do
    s.wtf8  = null
    s.wtf32 = null
  End
  Else Do
    s.utf8  = null
    s.utf32 = null
  End
Return s.

StringIsGood:
  If validate Then Return String(1)
  -- Only one target? Return a string
  If targets == 1 Then
    If return8 Then Return String(buffer8 ~string)
    Else            Return String(buffer32~string)
  -- Several targets. Return a stem.
  s. = .Stem~new()
  If allowLoneSurrogates Then Do
    s.wtf8  = String(buffer8 ~string)
    s.wtf32 = String(buffer32 ~string)
  End
  Else Do
    s.utf8  = String(buffer8 ~string)
    s.utf32 = String(buffer32 ~string)
  End
Return s.

Syntax:
  Raise Syntax 23.900 Additional("Invalid" format "sequence in position" i "of string: '"C2X(errorSequence)"'X")

-- Wrap a result in .Bytes when that class is available, else return it as is
String:
  If .Unicode.UTF8.Bytes Then Return .Bytes~new(Arg(1))
Return Arg(1)

--------------------------------------------------------------------------------
-- One time only: build the TRANSLATE tables for each supported encoding      --
--------------------------------------------------------------------------------
--
-- For every supported format, two parallel 256-character strings are stored
-- in .local under "Unicode.<format>.tablei" (all byte values) and
-- "Unicode.<format>.tableo" (the class assigned to each byte). UTF8 feeds
-- them to TRANSLATE to classify every byte of its input in one pass.
--
-- This routine also sets two flags in .local: Unicode.UTF8.Initialized,
-- and Unicode.UTF8.Bytes (1 when a .Bytes class subclassing .String exists).
--

::Routine InitializeTranslateTables

  .local~Unicode.UTF8.Initialized = 1

  -- See whether we should return .Bytes (when Unicode.cls has been loaded)
  -- or .String (for standalone use)
  --
  -- We check that .Bytes is a class and that .Bytes subclasses .String

  bytes = 0
  If .Bytes~isA(.Class), .Bytes~isSubclassOf(.String) Then bytes = 1
  .local~Unicode.UTF8.Bytes = bytes

  ASCII = "A"

  --
  -- UTF8
  --
  -- The Unicode® Standard. Version 15.0 – Core Specification
  -- https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf,
  -- table 3-7, p. 135.

  tablei = ""
  tableo = ""

  Call AssignCharacters "00","7F",ASCII -- ASCII Chars
  Call AssignCharacters "80","BF","C"   -- Continuation character
  Call AssignCharacters "C0","C1","I"   -- Illegal character
  Call AssignCharacters "C2","DF","20"X -- 2-byte sequences (1 continuation)
  Call AssignCharacters "E0","E0","3a"X -- 3-byte sequences of subtype "a": 2nd byte in A0..BF. Manual check.
  Call AssignCharacters "E1","EC","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "ED","ED","3c"X -- 3-byte sequences of subtype "c". 2nd byte in 80..9F. Manual check.
  Call AssignCharacters "EE","EF","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "F0","F0","4a"X -- 4-byte sequences of subtype "a". 2nd byte in 90..BF. Manual check.
  Call AssignCharacters "F1","F3","4b"X -- 4-byte sequences of subtype "b". 3 continuations.
  Call AssignCharacters "F4","F4","4c"X -- 4-byte sequences of subtype "c". 2nd byte in 80..8F. Manual check.
  Call AssignCharacters "F5","FF","I"   -- Illegal character

  .local["Unicode.UTF-8.tablei"] = tablei
  .local["Unicode.UTF-8.tableo"] = tableo

  -- UTF8Z
  --
  -- UTF-8Z is identical to UTF-8, except for the way in which the null character ("00"U) is encoded.
  --
  -- UTF-8Z encodes "00"U using the overlong encoding "C080"X.
  -- "00"X never appears in a UTF-8Z encoded string,
  -- and "C0"X continues to be invalid, unless immediately followed by "80"X.

  tablei = ""
  tableo = ""

  Call AssignCharacters "00","00","I"   -- "00"X is illegal in UTF8Z
  Call AssignCharacters "01","7F",ASCII -- ASCII Chars
  Call AssignCharacters "80","BF","C"   -- Continuation character
  Call AssignCharacters "C0","C0","0"   -- C080 --> "0000"U; C0xx --> illegal
  Call AssignCharacters "C1","C1","I"   -- Illegal character
  Call AssignCharacters "C2","DF","20"X -- 2-byte sequences (1 continuation)
  Call AssignCharacters "E0","E0","3a"X -- 3-byte sequences of subtype "a": 2nd byte in A0..BF. Manual check.
  Call AssignCharacters "E1","EC","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "ED","ED","3c"X -- 3-byte sequences of subtype "c". 2nd byte in 80..9F. Manual check.
  Call AssignCharacters "EE","EF","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "F0","F0","4a"X -- 4-byte sequences of subtype "a". 2nd byte in 90..BF. Manual check.
  Call AssignCharacters "F1","F3","4b"X -- 4-byte sequences of subtype "b". 3 continuations.
  Call AssignCharacters "F4","F4","4c"X -- 4-byte sequences of subtype "c". 2nd byte in 80..8F. Manual check.
  Call AssignCharacters "F5","FF","I"   -- Illegal character

  .local["Unicode.UTF-8Z.tablei"] = tablei
  .local["Unicode.UTF-8Z.tableo"] = tableo

  -- WTF8
  --
  -- See The WTF-8 Encoding, https://simonsapin.github.io/wtf-8/,
  -- table 3, which is a variant of UTF-8 table 3-7.

  tablei = ""
  tableo = ""

  Call AssignCharacters "00","7F",ASCII -- ASCII Chars
  Call AssignCharacters "80","BF","C"   -- Continuation character
  Call AssignCharacters "C0","C1","I"   -- Illegal character
  Call AssignCharacters "C2","DF","20"X -- 2-byte sequences (1 continuation)
  Call AssignCharacters "E0","E0","3a"X -- 3-byte sequences of subtype "a": 2nd byte in A0..BF. Manual check.
  Call AssignCharacters "E1","EC","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "ED","ED","3d"X -- 3-byte sequences of subtype "d". 2nd byte in 80..9F, normal char;
                                        --   in A0..AF, lead surrogate; in B0..BF, trail surrogate
  Call AssignCharacters "EE","EF","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "F0","F0","4a"X -- 4-byte sequences of subtype "a". 2nd byte in 90..BF. Manual check.
  Call AssignCharacters "F1","F3","4b"X -- 4-byte sequences of subtype "b". 3 continuations.
  Call AssignCharacters "F4","F4","4c"X -- 4-byte sequences of subtype "c". 2nd byte in 80..8F. Manual check.
  Call AssignCharacters "F5","FF","I"   -- Illegal character

  .local["Unicode.WTF-8.tablei"] = tablei
  .local["Unicode.WTF-8.tableo"] = tableo

  -- CESU-8
  --
  -- See Unicode Technical Report #26. COMPATIBILITY ENCODING SCHEME FOR UTF-16: 8-BIT (CESU-8)
  -- https://www.unicode.org/reports/tr26/tr26-4.html

  tablei = ""
  tableo = ""

  Call AssignCharacters "00","7F",ASCII -- ASCII Chars
  Call AssignCharacters "80","BF","C"   -- Continuation character
  Call AssignCharacters "C0","C1","I"   -- Illegal character
  Call AssignCharacters "C2","DF","20"X -- 2-byte sequences (1 continuation)
  Call AssignCharacters "E0","E0","3a"X -- 3-byte sequences of subtype "a": 2nd byte in A0..BF. Manual check.
  Call AssignCharacters "E1","EC","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "ED","ED","3e"X -- 3-byte sequences of subtype "e". 2nd byte in 80..9F, normal char;
                                        --   in A0..AF, lead surrogate of a possible pair;
                                        --   in B0..BF, lone trail surrogate (the scanner in UTF8
                                        --   accepts it as a plain 3-byte sequence)
  Call AssignCharacters "EE","EF","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "F0","FF","I"   -- Illegal character

  .local["Unicode.CESU-8.tablei"] = tablei
  .local["Unicode.CESU-8.tableo"] = tableo

  -- MUTF-8
  --
  -- See https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
  -- See https://docs.oracle.com/javase/specs/jvms/se16/html/jvms-4.html#jvms-4.4.7

  tablei = ""
  tableo = ""

  Call AssignCharacters "00","00","I"   -- "00"X is illegal in MUTF-8
  Call AssignCharacters "01","7F",ASCII -- ASCII Chars
  Call AssignCharacters "80","BF","C"   -- Continuation character
  Call AssignCharacters "C0","C0","0"   -- C080 --> "0000"U; C0xx --> illegal
  Call AssignCharacters "C1","C1","I"   -- Illegal character
  Call AssignCharacters "C2","DF","20"X -- 2-byte sequences (1 continuation)
  Call AssignCharacters "E0","E0","3a"X -- 3-byte sequences of subtype "a": 2nd byte in A0..BF. Manual check.
  Call AssignCharacters "E1","EC","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "ED","ED","3e"X -- 3-byte sequences of subtype "e". 2nd byte in 80..9F, normal char;
                                        --   in A0..AF, lead surrogate of a possible pair;
                                        --   in B0..BF, lone trail surrogate (the scanner in UTF8
                                        --   accepts it as a plain 3-byte sequence)
  Call AssignCharacters "EE","EF","3b"X -- 3-byte sequences of subtype "b". 2 continuations.
  Call AssignCharacters "F0","FF","I"   -- Illegal character

  .local["Unicode.MUTF-8.tablei"] = tablei
  .local["Unicode.MUTF-8.tableo"] = tableo

Return

-- Append the byte range from..to (given in hexadecimal) to "tablei", and the
-- same number of copies of "value" to "tableo", keeping both tables parallel.
AssignCharacters:
  use Arg from, to, value
  input   = XRange(X2C(from), X2C(to))
  tablei ||= input
  tableo ||= Copies(value,Length(input))
Return