This library provides functions to convert UCS2 strings to UTF8 and vice versa, and to manipulate UCS2/UTF8 strings. This library is currently used by the edk2 port of the open source jansson library. Signed-off-by: Abner Chang <abner.chang@hpe.com> Cc: Liming Gao <gaoliming@byosoft.com.cn> Cc: Leif Lindholm <leif@nuviainc.com> Cc: Nickle Wang <nickle.wang@hpe.com> Cc: Peter O'Hanley <peter.ohanley@hpe.com> Reviewed-by: Nickle Wang <nickle.wang@hpe.com> Acked-by: Leif Lindholm <leif@nuviainc.com> Reviewed-by: Michael D Kinney <michael.d.kinney@intel.com>
		
			
				
	
	
		
			422 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			422 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/** @file
 | 
						|
  UCS2 to UTF8 manipulation library.
 | 
						|
 | 
						|
  Copyright (c) 2018 - 2019, Intel Corporation. All rights reserved.<BR>
 | 
						|
  (C) Copyright 2020 Hewlett Packard Enterprise Development LP<BR>
 | 
						|
 | 
						|
    SPDX-License-Identifier: BSD-2-Clause-Patent
 | 
						|
 | 
						|
**/
 | 
						|
#include <Uefi.h>
 | 
						|
#include <Library/BaseLib.h>
 | 
						|
#include <Library/BaseMemoryLib.h>
 | 
						|
#include <Library/BaseUcs2Utf8Lib.h>
 | 
						|
#include <Library/DebugLib.h>
 | 
						|
#include <Library/MemoryAllocationLib.h>
 | 
						|
 | 
						|
/**
 | 
						|
  Since each UCS2 character can be represented by 1-3 UTF8 encoded characters,
 | 
						|
  this function is used to retrieve the UTF8 encoding size for a UCS2 character.
 | 
						|
 | 
						|
  @param[in]   Utf8Buffer       The buffer for UTF8 encoded data.
 | 
						|
 | 
						|
  @retval      Return the size of UTF8 encoding string or 0 if it is not for
 | 
						|
               UCS2 format.
 | 
						|
 | 
						|
**/
 | 
						|
UINT8
 | 
						|
GetUTF8SizeForUCS2 (
 | 
						|
  IN    CHAR8      *Utf8Buffer
 | 
						|
  )
 | 
						|
{
 | 
						|
  CHAR8    TempChar;
 | 
						|
  UINT8    Utf8Size;
 | 
						|
 | 
						|
  ASSERT (Utf8Buffer != NULL);
 | 
						|
 | 
						|
  TempChar = *Utf8Buffer;
 | 
						|
  if ((TempChar & 0xF0) == 0xF0) {
 | 
						|
 | 
						|
    //
 | 
						|
    // This format is not for UCS2.
 | 
						|
    //
 | 
						|
    return 0;
 | 
						|
  }
 | 
						|
 | 
						|
  Utf8Size = 1;
 | 
						|
  if ((TempChar & 0x80) == 0x80) {
 | 
						|
    if ((TempChar & 0xC0) == 0xC0) {
 | 
						|
 | 
						|
      Utf8Size ++;
 | 
						|
      if ((TempChar & 0xE0) == 0xE0) {
 | 
						|
 | 
						|
        Utf8Size ++;
 | 
						|
      }
 | 
						|
    }
 | 
						|
  }
 | 
						|
 | 
						|
  return Utf8Size;
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
  Since each UCS2 character can be represented by the format: \uXXXX, this function
 | 
						|
  is used to retrieve the UCS2 character from a Unicode format.
 | 
						|
  Call MUST make sure there are at least 6 Bytes in the input UTF8 buffer.
 | 
						|
 | 
						|
  @param[in]    Utf8Buffer             The buffer for UTF8 encoded data.
 | 
						|
  @param[out]   Ucs2Char               The converted UCS2 character.
 | 
						|
 | 
						|
  @retval       EFI_INVALID_PARAMETER  Non-Ascii characters found in the hexadecimal
 | 
						|
                                       digits string, and can't be converted to a UCS2
 | 
						|
                                       character.
 | 
						|
  @retval       EFI_SUCCESS            The UCS2 character has been retrieved.
 | 
						|
 | 
						|
**/
 | 
						|
EFI_STATUS
 | 
						|
GetUCS2CharByFormat (
 | 
						|
  IN    CHAR8      *Utf8Buffer,
 | 
						|
  OUT   CHAR16     *Ucs2Char
 | 
						|
  )
 | 
						|
{
 | 
						|
  UINT8     Num1;
 | 
						|
  UINT8     Num2;
 | 
						|
  UINT8     Index;
 | 
						|
  CHAR8     Ucs2CharFormat[UNICODE_FORMAT_CHAR_SIZE];  /// two Hexadecimal digits Ascii string, like "3F"
 | 
						|
 | 
						|
  for (Index = 0; Index < 4; Index ++) {
 | 
						|
    if ((*(Utf8Buffer + 2 + Index) & 0x80) != 0x00) {
 | 
						|
      return EFI_INVALID_PARAMETER;
 | 
						|
    }
 | 
						|
  }
 | 
						|
 | 
						|
  ZeroMem (Ucs2CharFormat, UNICODE_FORMAT_CHAR_SIZE);
 | 
						|
 | 
						|
  //
 | 
						|
  // Get the First Number, Offset is 2
 | 
						|
  //
 | 
						|
  CopyMem (Ucs2CharFormat, Utf8Buffer + 2, UNICODE_FORMAT_CHAR_LEN);
 | 
						|
  Num1 = (UINT8) AsciiStrHexToUintn (Ucs2CharFormat);
 | 
						|
 | 
						|
  //
 | 
						|
  // Get the Second Number, Offset is 4
 | 
						|
  //
 | 
						|
  CopyMem (Ucs2CharFormat, Utf8Buffer + 4, UNICODE_FORMAT_CHAR_LEN);
 | 
						|
  Num2 = (UINT8) AsciiStrHexToUintn (Ucs2CharFormat);
 | 
						|
 | 
						|
  //
 | 
						|
  // Ucs2Char is Little-Endian
 | 
						|
  //
 | 
						|
  *((CHAR8 *) Ucs2Char)        = Num2;
 | 
						|
  *(((CHAR8 *) Ucs2Char) + 1) = Num1;
 | 
						|
 | 
						|
  return EFI_SUCCESS;
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
  Convert a UCS2 character to UTF8 encoding string.
 | 
						|
 | 
						|
  @param[in]    Ucs2Char               The provided UCS2 character.
 | 
						|
  @param[out]   Utf8Buffer             The converted UTF8 encoded data.
 | 
						|
 | 
						|
  @retval      Return the size of UTF8 encoding data for this UCS2 character.
 | 
						|
 | 
						|
**/
 | 
						|
UINT8
 | 
						|
UCS2CharToUTF8 (
 | 
						|
  IN  CHAR16     Ucs2Char,
 | 
						|
  OUT CHAR8      *Utf8Buffer
 | 
						|
  )
 | 
						|
{
 | 
						|
  UINT16    Ucs2Number;
 | 
						|
 | 
						|
  ASSERT (Utf8Buffer != NULL);
 | 
						|
 | 
						|
  Ucs2Number = (UINT16) Ucs2Char;
 | 
						|
  if (Ucs2Number <= 0x007F) {
 | 
						|
 | 
						|
    //
 | 
						|
    // UTF8 format: 0xxxxxxx
 | 
						|
    //
 | 
						|
    *Utf8Buffer = Ucs2Char & 0x7F;
 | 
						|
    return 1;
 | 
						|
 | 
						|
  } else if (Ucs2Number >= 0x0080 && Ucs2Number <= 0x07FF) {
 | 
						|
 | 
						|
    //
 | 
						|
    // UTF8 format: 110xxxxx 10xxxxxx
 | 
						|
    //
 | 
						|
    *(Utf8Buffer + 1) = (Ucs2Char & 0x3F) | 0x80;
 | 
						|
    *Utf8Buffer       = ((Ucs2Char >> 6) & 0x1F) | 0xC0;
 | 
						|
    return 2;
 | 
						|
 | 
						|
  } else {  /// Ucs2Number >= 0x0800 && Ucs2Number <= 0xFFFF
 | 
						|
 | 
						|
    //
 | 
						|
    // UTF8 format: 1110xxxx 10xxxxxx 10xxxxxx
 | 
						|
    //
 | 
						|
    *(Utf8Buffer + 2) = (Ucs2Char & 0x3F) | 0x80;
 | 
						|
    *(Utf8Buffer + 1) = ((Ucs2Char >> 6) & 0x3F) | 0x80;
 | 
						|
    *Utf8Buffer       = ((Ucs2Char >> 12) & 0x0F) | 0xE0;
 | 
						|
    return 3;
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
  Convert a UTF8 encoded data to a UCS2 character.
 | 
						|
 | 
						|
  @param[in]    Utf8Buffer             The provided UTF8 encoded data.
 | 
						|
  @param[out]   Ucs2Char               The converted UCS2 character.
 | 
						|
 | 
						|
  @retval       EFI_INVALID_PARAMETER  The UTF8 encoded string is not valid or
 | 
						|
                                       not for UCS2 character.
 | 
						|
  @retval       EFI_SUCCESS            The converted UCS2 character.
 | 
						|
 | 
						|
**/
 | 
						|
EFI_STATUS
 | 
						|
UTF8ToUCS2Char (
 | 
						|
  IN   CHAR8      *Utf8Buffer,
 | 
						|
  OUT  CHAR16     *Ucs2Char
 | 
						|
  )
 | 
						|
{
 | 
						|
  UINT8    Utf8Size;
 | 
						|
  CHAR8    *Ucs2Buffer;
 | 
						|
  CHAR8    TempChar1;
 | 
						|
  CHAR8    TempChar2;
 | 
						|
  CHAR8    TempChar3;
 | 
						|
 | 
						|
  ASSERT (Utf8Buffer != NULL && Ucs2Char != NULL);
 | 
						|
  ZeroMem (Ucs2Char, sizeof (CHAR16));
 | 
						|
  Ucs2Buffer = (CHAR8 *) Ucs2Char;
 | 
						|
 | 
						|
  Utf8Size = GetUTF8SizeForUCS2 (Utf8Buffer);
 | 
						|
  switch (Utf8Size) {
 | 
						|
 | 
						|
    case 1:
 | 
						|
 | 
						|
      //
 | 
						|
      // UTF8 format: 0xxxxxxx
 | 
						|
      //
 | 
						|
      TempChar1 = *Utf8Buffer;
 | 
						|
      if ((TempChar1 & 0x80) != 0x00) {
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      *Ucs2Buffer       = TempChar1;
 | 
						|
      *(Ucs2Buffer + 1) = 0;
 | 
						|
      break;
 | 
						|
 | 
						|
    case 2:
 | 
						|
 | 
						|
      //
 | 
						|
      // UTF8 format: 110xxxxx 10xxxxxx
 | 
						|
      //
 | 
						|
      TempChar1 = *Utf8Buffer;
 | 
						|
      if ((TempChar1 & 0xE0) != 0xC0) {
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      TempChar2 = *(Utf8Buffer + 1);
 | 
						|
      if ((TempChar2 & 0xC0) != 0x80) {
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      *Ucs2Buffer       = (TempChar1 << 6) + (TempChar2 & 0x3F);
 | 
						|
      *(Ucs2Buffer + 1) = (TempChar1 >> 2) & 0x07;
 | 
						|
      break;
 | 
						|
 | 
						|
    case 3:
 | 
						|
 | 
						|
      //
 | 
						|
      // UTF8 format: 1110xxxx 10xxxxxx 10xxxxxx
 | 
						|
      //
 | 
						|
      TempChar1 = *Utf8Buffer;
 | 
						|
      if ((TempChar1 & 0xF0) != 0xE0) {
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      TempChar2 = *(Utf8Buffer + 1);
 | 
						|
      if ((TempChar2 & 0xC0) != 0x80) {
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      TempChar3 = *(Utf8Buffer + 2);
 | 
						|
      if ((TempChar3 & 0xC0) != 0x80) {
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      *Ucs2Buffer       = (TempChar2 << 6) + (TempChar3 & 0x3F);
 | 
						|
      *(Ucs2Buffer + 1) = (TempChar1 << 4) + ((TempChar2 >> 2) & 0x0F);
 | 
						|
 | 
						|
      break;
 | 
						|
 | 
						|
    default:
 | 
						|
 | 
						|
      return EFI_INVALID_PARAMETER;
 | 
						|
  }
 | 
						|
 | 
						|
  return EFI_SUCCESS;
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
  Convert a UCS2 string to a UTF8 encoded string.
 | 
						|
 | 
						|
  @param[in]    Ucs2Str                The provided UCS2 string.
 | 
						|
  @param[out]   Utf8StrAddr            The converted UTF8 string address. Caller
 | 
						|
                                       is responsible for Free this string.
 | 
						|
 | 
						|
  @retval       EFI_INVALID_PARAMETER  One or more parameters are invalid.
 | 
						|
  @retval       EFI_OUT_OF_RESOURCES   System runs out of resources.
 | 
						|
  @retval       EFI_SUCCESS            The UTF8 encoded string has been converted.
 | 
						|
 | 
						|
**/
 | 
						|
EFI_STATUS
 | 
						|
UCS2StrToUTF8 (
 | 
						|
  IN  CHAR16     *Ucs2Str,
 | 
						|
  OUT CHAR8      **Utf8StrAddr
 | 
						|
  )
 | 
						|
{
 | 
						|
  UINTN    Ucs2StrIndex;
 | 
						|
  UINTN    Ucs2StrLength;
 | 
						|
  CHAR8    *Utf8Str;
 | 
						|
  UINTN    Utf8StrLength;
 | 
						|
  UINTN    Utf8StrIndex;
 | 
						|
  CHAR8    Utf8Buffer[UTF8_BUFFER_FOR_UCS2_MAX_SIZE];
 | 
						|
  UINT8    Utf8BufferSize;
 | 
						|
 | 
						|
  if (Ucs2Str == NULL || Utf8StrAddr == NULL) {
 | 
						|
    return EFI_INVALID_PARAMETER;
 | 
						|
  }
 | 
						|
 | 
						|
  Ucs2StrLength = StrLen (Ucs2Str);
 | 
						|
  Utf8StrLength = 0;
 | 
						|
 | 
						|
  for (Ucs2StrIndex = 0; Ucs2StrIndex < Ucs2StrLength; Ucs2StrIndex ++) {
 | 
						|
 | 
						|
    ZeroMem (Utf8Buffer, sizeof (Utf8Buffer));
 | 
						|
    Utf8BufferSize = UCS2CharToUTF8 (Ucs2Str[Ucs2StrIndex], Utf8Buffer);
 | 
						|
    Utf8StrLength += Utf8BufferSize;
 | 
						|
  }
 | 
						|
 | 
						|
  Utf8Str = AllocateZeroPool (Utf8StrLength + 1);
 | 
						|
  if (Utf8Str == NULL) {
 | 
						|
    return EFI_OUT_OF_RESOURCES;
 | 
						|
  }
 | 
						|
 | 
						|
  Utf8StrIndex = 0;
 | 
						|
  for (Ucs2StrIndex = 0; Ucs2StrIndex < Ucs2StrLength; Ucs2StrIndex ++) {
 | 
						|
 | 
						|
    ZeroMem (Utf8Buffer, sizeof (Utf8Buffer));
 | 
						|
    Utf8BufferSize = UCS2CharToUTF8 (Ucs2Str[Ucs2StrIndex], Utf8Buffer);
 | 
						|
 | 
						|
    CopyMem (Utf8Str + Utf8StrIndex, Utf8Buffer, Utf8BufferSize);
 | 
						|
    Utf8StrIndex += Utf8BufferSize;
 | 
						|
  }
 | 
						|
 | 
						|
  Utf8Str[Utf8StrIndex] = '\0';
 | 
						|
  *Utf8StrAddr = Utf8Str;
 | 
						|
 | 
						|
  return EFI_SUCCESS;
 | 
						|
}
 | 
						|
 | 
						|
/**
 | 
						|
  Convert a UTF8 encoded string to a UCS2 string.
 | 
						|
 | 
						|
  @param[in]    Utf8Str                The provided UTF8 encoded string.
 | 
						|
  @param[out]   Ucs2StrAddr            The converted UCS2 string address. Caller
 | 
						|
                                       is responsible for Free this string.
 | 
						|
 | 
						|
  @retval       EFI_INVALID_PARAMETER  The UTF8 encoded string is not valid to
 | 
						|
                                       convert to UCS2 string.
 | 
						|
                                       One or more parameters are invalid.
 | 
						|
  @retval       EFI_OUT_OF_RESOURCES   System runs out of resources.
 | 
						|
  @retval       EFI_SUCCESS            The UCS2 string has been converted.
 | 
						|
 | 
						|
**/
 | 
						|
EFI_STATUS
 | 
						|
UTF8StrToUCS2 (
 | 
						|
  IN  CHAR8      *Utf8Str,
 | 
						|
  OUT CHAR16     **Ucs2StrAddr
 | 
						|
  )
 | 
						|
{
 | 
						|
  EFI_STATUS    Status;
 | 
						|
  UINTN         Utf8StrIndex;
 | 
						|
  UINTN         Utf8StrLength;
 | 
						|
  UINTN         Ucs2StrIndex;
 | 
						|
  UINT8         Utf8BufferSize;
 | 
						|
  CHAR16        *Ucs2StrTemp;
 | 
						|
 | 
						|
  if (Utf8Str == NULL || Ucs2StrAddr == NULL) {
 | 
						|
    return EFI_INVALID_PARAMETER;
 | 
						|
  }
 | 
						|
 | 
						|
  //
 | 
						|
  // It is not an Ascii string, calculate string length.
 | 
						|
  //
 | 
						|
  Utf8StrLength = 0;
 | 
						|
  while (*(Utf8Str + Utf8StrLength) != '\0') {
 | 
						|
    Utf8StrLength ++;
 | 
						|
  }
 | 
						|
 | 
						|
  //
 | 
						|
  // UCS2 string shall not be longer than the UTF8 string.
 | 
						|
  //
 | 
						|
  Ucs2StrTemp = AllocateZeroPool ((Utf8StrLength + 1) * sizeof (CHAR16));
 | 
						|
  if (Ucs2StrTemp == NULL) {
 | 
						|
    return EFI_OUT_OF_RESOURCES;
 | 
						|
  }
 | 
						|
 | 
						|
  Utf8StrIndex = 0;
 | 
						|
  Ucs2StrIndex = 0;
 | 
						|
  while (Utf8Str[Utf8StrIndex] != '\0') {
 | 
						|
 | 
						|
    if (CompareMem (Utf8Str + Utf8StrIndex, "\\u", 2) == 0 &&
 | 
						|
      Utf8StrLength - Utf8StrIndex >= UNICODE_FORMAT_LEN) {
 | 
						|
 | 
						|
      Status = GetUCS2CharByFormat (Utf8Str + Utf8StrIndex, Ucs2StrTemp + Ucs2StrIndex);
 | 
						|
      if (!EFI_ERROR (Status)) {
 | 
						|
 | 
						|
        Utf8StrIndex += UNICODE_FORMAT_LEN;
 | 
						|
        Ucs2StrIndex ++;
 | 
						|
      } else {
 | 
						|
 | 
						|
        StrCpyS (Ucs2StrTemp + Ucs2StrIndex, 3, L"\\u");
 | 
						|
 | 
						|
        Ucs2StrIndex += 2;
 | 
						|
        Utf8StrIndex += 2;
 | 
						|
      }
 | 
						|
    } else {
 | 
						|
 | 
						|
      Utf8BufferSize = GetUTF8SizeForUCS2 (Utf8Str + Utf8StrIndex);
 | 
						|
      if (Utf8BufferSize == 0 || Utf8StrLength - Utf8StrIndex < Utf8BufferSize) {
 | 
						|
 | 
						|
        FreePool (Ucs2StrTemp);
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      Status = UTF8ToUCS2Char (Utf8Str + Utf8StrIndex, Ucs2StrTemp + Ucs2StrIndex);
 | 
						|
      if (EFI_ERROR (Status)) {
 | 
						|
 | 
						|
        FreePool (Ucs2StrTemp);
 | 
						|
        return EFI_INVALID_PARAMETER;
 | 
						|
      }
 | 
						|
 | 
						|
      Ucs2StrIndex ++;
 | 
						|
      Utf8StrIndex += Utf8BufferSize;
 | 
						|
    }
 | 
						|
  }
 | 
						|
 | 
						|
  *Ucs2StrAddr = AllocateZeroPool ((Ucs2StrIndex + 1) * sizeof (CHAR16));
 | 
						|
  if (*Ucs2StrAddr == NULL) {
 | 
						|
 | 
						|
    FreePool (Ucs2StrTemp);
 | 
						|
    return EFI_OUT_OF_RESOURCES;
 | 
						|
  }
 | 
						|
 | 
						|
  StrCpyS (*Ucs2StrAddr, Ucs2StrIndex + 1, Ucs2StrTemp);
 | 
						|
  *(*Ucs2StrAddr + Ucs2StrIndex) = L'\0';
 | 
						|
  FreePool (Ucs2StrTemp);
 | 
						|
 | 
						|
  return EFI_SUCCESS;
 | 
						|
}
 | 
						|
 |