Byte Loss in String-Literal Concatenation

Background

Last week a lively discussion in CnPack’s WeChat group revolved around this Delphi program:

// Compile with code page 936
program Problem;

const
  strPublicKey: RawByteString =
    #$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05 +
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$29#$A9 +
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$36#$B9#$23#$0C#$CA#$D4 +
    #$A8#$B8#$7C#$E6#$32#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;
begin
  Writeln(Length(strPublicKey)); // expected 62 got 58 - why?
  Readln;
end.

The reporter noted that it worked in Delphi 7 when using AnsiString. Some observations:

The same result appears if RawByteString is changed to AnsiString.
If the four literals are merged into one long literal (no +), the length is correct.
Liu Xiao said that a dump of strPublicKey contains some ? characters, showing that some bytes were replaced.
The output is 62 when an explicit type cast is added to the first literal: AnsiString(#$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05)

I also noticed the following dcc warning in Delphi 12.3, but it disappeared when I changed RawByteString to AnsiString. That is strange, because the conversion from string to AnsiString can lose data as well. (I will leave this part for a future investigation.)

[dcc32 Warning] W1058 Implicit string cast with potential data loss from 'string' to 'RawByteString'

I was curious about the data-loss issue and decided to investigate it.

Note: Generally, the correct approach is to use a byte array to represent binary data, since strings are intended for textual content. You may also skip the analysis and jump directly to the Conclusion section.

A Minimal Repro

I originally suspected that the data loss occurred during string conversion triggered by the + operator, so I wrote a simpler program to reproduce the issue.

// Compile with code page 936
program DataLoss;

{$APPTYPE CONSOLE}
{$R *.res}

const
  s1: RawByteString = #$41#$42 + #$B4#$29#$0D#$0A;

begin
  Writeln(Length(s1));  // Expected 6, got 5.
  Readln;
end.

It now looks as though the second string literal is being promoted from an AnsiString to a UnicodeString. Because the byte sequence #$B4#$29 is invalid in code page 936 (CP936), the compiler replaces it with a single question mark (?, 0x3F). Consequently, the resulting string is only 5 bytes long instead of 6.

Code page 936:
Lead byte: #$81..#$FE
Trail byte: #$40..#$FE, excluding #$7F

OK. I also tried adding an explicit type cast to each operand in turn:

const s1: RawByteString = AnsiString(#$41#$42) + #$B4#$29#$0D#$0A;
// output 6

const s1: RawByteString = #$41#$42 + AnsiString(#$B4#$29#$0D#$0A);
// output 5

It seems that #$41#$42 is treated as a UnicodeString. I thought the type of + expression depends on the left operand. I posted a thread in Embarcadero’s private Beta forum.

Diagnostic Code

Bruneau (Embarcadero) reproduced the issue with the following rigorous diagnostic code:

// provided by Bruneau
program DataLoss;

{$APPTYPE CONSOLE}
{$R *.res}

uses System.SysUtils, System.Win.Crtl;

const
{$IF DEFINED(CAST_ANSI_FIRST)}
  s1: RawByteString = AnsiString(#$41#$42) + #$B4#$29#$0D#$0A;
  msg = 'CAST_ANSI_FIRST';
{$ELSEIF DEFINED(CAST_ANSI_SECOND)}
  s1: RawByteString = #$41#$42 + AnsiString(#$B4#$29#$0D#$0A);
  msg = 'CAST_ANSI_SECOND';
{$ELSE}
  s1: RawByteString = #$41#$42 + #$B4#$29#$0D#$0A;
  msg = 'USE_RAWBYTESTRING';
{$ENDIF}

// Renders AStr as a human-readable dump, one cell per element: a printable
// element is shown as a quoted character, any other element as a '$'-prefixed
// hexadecimal value. Each cell is left-justified in a column of at least
// 5 characters. Returns the concatenated cells ('' when AStr is empty).
function DumpStr(const AStr: RawByteString): string;
begin
  Result := '';
  for var I := Low(AStr) to High(AStr) do
  begin
    var ch: AnsiChar := AStr[I];
    var s: string;
    // isprint decides between the quoted-character and the hex rendering
    // (presumably the C runtime routine from System.Win.Crtl -- confirm).
    if isprint(Integer(ch)) <> 0 then
      s := Format('''%s''', [ch])
    else 
      s := Format('$%0.2x', [Integer(ch)]);
    // Append the new cell, padded on the right to at least 5 characters.
    Result := Format('%s%-5s', [Result, s]);
  end;
end;

// Prints one report line for the selected build flavor: the flavor name (msg),
// the dump of s1 produced by DumpStr, and the length of s1.
procedure Main;
begin
  Writeln(Format('%-17s: %-35s Len:%d', [msg, DumpStr(s1), Length(s1)]));
end;

begin
  Main;
end.

He also noted that all three flavors work when #$B4#$29 is changed to a valid pair such as #$B4#$D0.

As Delphi has no formal specification, he debugged the Delphi compiler and saw that, when dcc handles the + operator, the expression type is the same as that of the left operand in the first two cases:

const s1: RawByteString = AnsiString(#$41#$42) + #$B4#$29#$0D#$0A;
const s1: RawByteString = #$41#$42 + AnsiString(#$B4#$29#$0D#$0A);

UweRaabe raised the question of whether the directive {$HIGHCHARUNICODE ON} makes any difference. The directive is OFF by default. “When it is ON, the compiler parses each two-digit hexadecimal #$xx literal as a WideChar, so there is no possibility of interpreting B4 as the lead byte of some character in CP936.”

Digging Deeper

It seems that #$41#$42 is treated as a UnicodeString and #$B4#$29#$0D#$0A as an AnsiString, so I ran more experiments to learn how the compiler types string literals.

const
  s1 = #$41#$42;          // UnicodeString
  s1 = #$41#$42#$80;      // UnicodeString (#$80 is EUR symbol in CP936)
  s1 = #$41#$42#$81;      // AnsiString  (#$81 is a lead byte in CP936 but the single byte is invalid character)
  s1 = #$41#$42#$81#$29;  // AnsiString  (invalid trail byte)
  s1 = #$41#$42#$81#$40;  // UnicodeString (valid byte pair)

(The DumpStr overload that the Delphi IDE selects for a given literal reveals its string kind.)

// provided by Bruneau
function DumpStr(const AStr: RawByteString): string; overload;
begin
//...
end;

// Overload for string arguments: renders AStr as a human-readable dump, one
// cell per element. A printable element is shown as a quoted character, any
// other element as a '$'-prefixed hexadecimal value of at least two digits.
// Each cell is left-justified in a column of at least 5 characters.
// Returns the concatenated cells ('' when AStr is empty).
function DumpStr(const AStr: string): string; overload;
begin
  Result := '';
  for var I := Low(AStr) to High(AStr) do
  begin
    var ch := AStr[I];
    var s: string;
    // NOTE(review): ch is presumably a WideChar here, so Integer(ch) can
    // exceed 255 -- confirm that isprint tolerates values in that range.
    if isprint(Integer(ch)) <> 0 then
      s := Format('''%s''', [ch])
    else 
      s := Format('$%0.2x', [Integer(ch)]);
    // Append the new cell, padded on the right to at least 5 characters.
    Result := Format('%s%-5s', [Result, s]);
  end;
end;

Conclusion

Why does data loss occur? In short, it is caused by converting an invalid AnsiString to a UnicodeString. Invalid byte sequences are replaced with the ? character. But how does this happen exactly? What’s the underlying reason?

Everything hinges on two factors: how dcc evaluates the control‑string literal and which string type it assigns to the resulting value.

1. String literal value

With HIGHCHARUNICODE OFF (the default) the two-digit hex literal is interpreted as an AnsiString/AnsiChar.
With HIGHCHARUNICODE ON the literal is interpreted as a UnicodeString/WideChar.

2. Inferred type

In an explicit cast such as AnsiString(expr), where expr is a primary expression without +, the cast dictates the expression type. (Or is the type perhaps determined by more of the surrounding context?)
Otherwise the compiler chooses UnicodeString unless the byte sequence contains characters that are invalid in the current code page; in that case it treats the entire literal as AnsiString (to prevent immediate data loss?).

3. The `+` operator

In an expression lhs + rhs, the resulting string type is determined by both operands, as described in the Delphi Language Guide:

  s1 = #$41#$42#$80 + #$B4#$29#$0D#$0A;  // UnicodeString + AnsiString -> UnicodeString (data loss)
  s1 = #$41#$42#$81 + #$B4#$29#$0D#$0A;  // AnsiString + AnsiString -> AnsiString
  s1 = #$41#$42#$81 + #13#10;     // AnsiString + UnicodeString -> UnicodeString
  s1 = #$41#$42#$81 + 'abc';      // AnsiString + UnicodeString -> UnicodeString
  s1 = #$41#$42#$81 + 'a';        // AnsiString + WideChar -> UnicodeString
  s1 = #$41#$42#$81 + AnsiChar('a');    // AnsiString + AnsiChar -> AnsiString

Bruneau added some logging to dcc, and it showed that the value types are correct.

He also pointed out that the last case is interesting: the compiler simply appends the AnsiChar byte to the left operand. AnsiChar('a') is #$61, so the whole literal value is #$41#$42#$81#$61, and the #$81#$61 pair is a valid character in CP936.

Revisiting the Original Program

Let’s look at the problem again. Here is the code, annotated with the type inferred for each literal:

// Compile with code page 936
program Problem;

const
  strPublicKey: RawByteString =
    #$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05 + // UnicodeString: no invalid byte sequence in CP936
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$29#$A9 + // AnsiString: `#$B4#$29` and the trailing `#$A9` are invalid in CP936
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$36#$B9#$23#$0C#$CA#$D4 + // AnsiString: `#$98#$36` and `#$B9#$23` are invalid in CP936
    #$A8#$B8#$7C#$E6#$32#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;  // AnsiString: `#$E6#$32` is invalid in CP936

begin
  // Dump the resulting constant and its length to show which bytes were replaced.
  Writeln(DumpStr(strPublicKey), ' Len:', Length(strPublicKey));
  Readln;
end.

The first string literal is interpreted as a UnicodeString without any data loss. However, since the three subsequent string literals contain byte sequences that are invalid in CP936, they are treated as AnsiStrings. When the compiler encounters a UnicodeString + AnsiString operation, it converts the AnsiString into a UnicodeString. During this conversion, each invalid byte sequence (normally a valid lead byte followed by an invalid or missing trail byte) is replaced with a ? character.

We incrementally replaced the invalid byte sequences (#$B4#$29, #$A9, #$98#$36, #$B9#$23, #$E6#$32) with valid values and reran DumpStr. We also tried casting the first string literal to AnsiString.

const
  s1: RawByteString =
    #$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05 +
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$40#$A9 +
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$36#$B9#$23#$0C#$CA#$D4 +
    #$A8#$B8#$7C#$E6#$32#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;
  s2: RawByteString =
    #$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05 +
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$40#$41 +
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$36#$B9#$23#$0C#$CA#$D4 +
    #$A8#$B8#$7C#$E6#$32#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;
  s3: RawByteString =
    #$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05 +
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$40#$41 +
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$40#$B9#$23#$0C#$CA#$D4 +
    #$A8#$B8#$7C#$E6#$32#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;
  s4: RawByteString =
    #$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05 +
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$40#$41 +
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$40#$B9#$40#$0C#$CA#$D4 +
    #$A8#$B8#$7C#$E6#$32#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;
  s5: RawByteString =
    #$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05 +
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$40#$41 +
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$40#$B9#$40#$0C#$CA#$D4 +
    #$A8#$B8#$7C#$E6#$40#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;
  s6: RawByteString =
AnsiString(#$30#$3C#$30#$0D#$06#$09#$2A#$86#$48#$86#$F7#$0D#$01#$01#$01#$05) +
    #$00#$03#$2B#$00#$30#$28#$02#$21#$00#$A4#$65#$B8#$CD#$B4#$29#$A9 +
    #$64#$1A#$C5#$80#$55#$22#$1B#$BB#$C5#$98#$36#$B9#$23#$0C#$CA#$D4 +
    #$A8#$B8#$7C#$E6#$32#$E3#$89#$3D#$77#$02#$03#$01#$00#$01;
begin
  Writeln(Format('%-35s Len:%d'#13#10, [DumpStr(s1), Length(s1)]));
  Writeln(Format('%-35s Len:%d'#13#10, [DumpStr(s2), Length(s2)]));
  Writeln(Format('%-35s Len:%d'#13#10, [DumpStr(s3), Length(s3)]));
  Writeln(Format('%-35s Len:%d'#13#10, [DumpStr(s4), Length(s4)]));
  Writeln(Format('%-35s Len:%d'#13#10, [DumpStr(s5), Length(s5)]));
  Writeln(Format('%-35s Len:%d'#13#10, [DumpStr(s6), Length(s6)]));
  Readln;
end.

The results match the predictions we derived in our earlier analysis, confirming that our reasoning was sound. It’s reassuring—and frankly satisfying—to see the empirical output align so closely with what we anticipated.

Special thanks to Bruneau for his invaluable assistance—his replies are always a pleasure to read, and I learn something new each time.