// Wokwi - Online ESP32, STM32, Arduino Simulator

/*
   This Arduino sketch demonstrates how to convert hex-coded 16-bit characters
   (typically received from a GSM module in UCS2/UTF-16BE format) to UTF-8,
   without using the String class.

   The GSM module often sends SMS messages in UCS2 (UTF-16 Big Endian) when
   special characters are present. This code parses the hex representation
   of these 16-bit characters and converts them to their UTF-8 equivalent.

   Author: [Your Name/Handle]
   Date:   July 22, 2025
   License: MIT
*/

// Decodes one hexadecimal digit (0-9, A-F, a-f) into its numeric value (0-15).
// Any other character decodes to 0, so an invalid digit is indistinguishable
// from '0' for the caller.
byte hexToByte(char c) {
  byte value = 0; // Fallback used when c is not a hex digit

  if (c >= 'a' && c <= 'f') {
    value = (c - 'a') + 10; // Lowercase letters map to 10..15
  } else if (c >= 'A' && c <= 'F') {
    value = (c - 'A') + 10; // Uppercase letters map to 10..15
  } else if (c >= '0' && c <= '9') {
    value = c - '0';        // Decimal digits map to 0..9
  }

  return value;
}

// Combines two hex digits (e.g., 'A' and 'B' for "AB") into a single byte:
// the first digit supplies the upper four bits, the second the lower four.
byte hexToByte(char highNibble, char lowNibble) {
  const byte upperBits = hexToByte(highNibble);
  const byte lowerBits = hexToByte(lowNibble);
  return (upperBits << 4) | lowerBits;
}

// Decodes a single hex digit. Returns 0-15, or -1 when `c` is not a hex digit
// (this includes the string terminator '\0'). A validating decoder is needed
// here because invalid digits must be reported rather than treated as zero.
static int utf16HexDigitValue(char c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return c - 'A' + 10;
  }
  if (c >= 'a' && c <= 'f') {
    return c - 'a' + 10;
  }
  return -1;
}

// Decodes the two hex digits at hex[0] and hex[1] into a value 0-255, or
// returns -1 if either is not a hex digit. hex[1] is only read after hex[0]
// has been validated, so an empty string is never read past its terminator.
static int utf16HexPairValue(const char* hex) {
  const int high = utf16HexDigitValue(hex[0]);
  if (high < 0) {
    return -1;
  }
  const int low = utf16HexDigitValue(hex[1]);
  if (low < 0) {
    return -1;
  }
  return (high << 4) | low;
}

/**
 * Converts a hex-coded 16-bit character (one UTF-16BE code unit) to UTF-8.
 *
 * Only the first two characters of each hex string are read.
 *
 * @param hexCharHighByte Two hex digits for the higher byte (e.g., "00" for U+0020)
 * @param hexCharLowByte Two hex digits for the lower byte (e.g., "20" for U+0020)
 * @param outputBuffer A buffer to store the UTF-8 encoded bytes. At most 3 bytes are
 *                     written and no null terminator is appended; a 4-byte buffer
 *                     leaves room for a terminator added by the caller.
 * @return The number of bytes written to the outputBuffer (1 to 3), or 0 if the input
 *         is invalid: a null pointer, a non-hex digit (including a string shorter than
 *         two characters), or a surrogate code unit (U+D800..U+DFFF). Nothing is
 *         written to outputBuffer when 0 is returned.
 */
int utf16BEHexToUtf8(const char* hexCharHighByte, const char* hexCharLowByte, char* outputBuffer) {
  if (!hexCharHighByte || !hexCharLowByte || !outputBuffer) {
    return 0; // Invalid input pointers
  }

  const int highByte = utf16HexPairValue(hexCharHighByte);
  const int lowByte = utf16HexPairValue(hexCharLowByte);
  if (highByte < 0 || lowByte < 0) {
    return 0; // Not two valid hex digits each
  }

  // Combine in unsigned arithmetic so the shift never touches the sign bit of
  // a 16-bit int (the width of int on AVR boards).
  const uint16_t unicodeChar =
      static_cast<uint16_t>((static_cast<uint16_t>(highByte) << 8) | static_cast<uint16_t>(lowByte));

  // A surrogate is only half of a UTF-16 pair; encoding it on its own would
  // produce ill-formed UTF-8. Pairs (characters beyond U+FFFF) are outside the
  // UCS2 range handled by this single-code-unit converter.
  if (unicodeChar >= 0xD800 && unicodeChar <= 0xDFFF) {
    return 0;
  }

  if (unicodeChar <= 0x7F) {
    // 1-byte sequence (ASCII): 0xxxxxxx
    outputBuffer[0] = static_cast<char>(unicodeChar);
    return 1;
  }

  if (unicodeChar <= 0x7FF) {
    // 2-byte sequence: 110xxxxx 10xxxxxx
    outputBuffer[0] = static_cast<char>(0xC0 | (unicodeChar >> 6));
    outputBuffer[1] = static_cast<char>(0x80 | (unicodeChar & 0x3F));
    return 2;
  }

  // Everything else that fits in 16 bits is a 3-byte sequence:
  // 1110xxxx 10xxxxxx 10xxxxxx
  outputBuffer[0] = static_cast<char>(0xE0 | (unicodeChar >> 12));
  outputBuffer[1] = static_cast<char>(0x80 | ((unicodeChar >> 6) & 0x3F));
  outputBuffer[2] = static_cast<char>(0x80 | (unicodeChar & 0x3F));
  return 3;
}

// Converts one hex-coded 16-bit character and prints it on its own line as
// "Hex: HHLL -> UTF-8: <character>". Prints nothing if the conversion fails.
static void printConversionExample(const char* highHex, const char* lowHex) {
  char utf8[4]; // Max 3 bytes for this range + null terminator
  const int bytesWritten = utf16BEHexToUtf8(highHex, lowHex, utf8);
  if (bytesWritten <= 0) {
    return;
  }
  utf8[bytesWritten] = '\0'; // Null-terminate
  Serial.print("Hex: ");
  Serial.print(highHex);
  Serial.print(lowHex);
  Serial.print(" -> UTF-8: ");
  Serial.println(utf8);
}

// Runs the demonstration once: prints five single-character conversions, then
// converts a complete hex-encoded UCS2 message to UTF-8 and prints it.
void setup() {
  Serial.begin(115200);
  while (!Serial) {
    ; // Wait for Serial port to connect. Needed for native USB port only
  }
  Serial.println("UTF-16BE Hex to UTF-8 Converter (Arduino)");
  Serial.println("----------------------------------------");

  // Example 1: ASCII character (space ' ') - U+0020, 1-byte UTF-8
  printConversionExample("00", "20");

  // Example 2: Currency symbol ('€' Euro sign) - U+20AC, 3-byte UTF-8
  printConversionExample("20", "AC");

  // Example 3: Greek capital letter Omega 'Ω' - U+03A9, 2-byte UTF-8
  printConversionExample("03", "A9");

  // Example 4: Latin capital letter 'A' - U+0041, 1-byte UTF-8
  printConversionExample("00", "41");

  // Example 5: '¢' Cent sign - U+00A2, 2-byte UTF-8
  printConversionExample("00", "A2");

  // Example of processing a complete hex-encoded message
  Serial.println("\nProcessing a sample hex-encoded message:");
  // Sample message in UCS2 hex, four hex digits per character:
  // U+201E, 's', 'l', 'e', U+2013, 'p', 'i', U+010D, U+2014, 'k', 'a', U+201C
  const char* hexMessage = "201E0073006C0065201300700069010D2014006B0061201C";
  char utf8OutputBuffer[100]; // Buffer to hold the entire converted message
  // Usable capacity; one byte stays reserved for the null terminator.
  const int outputCapacity = static_cast<int>(sizeof(utf8OutputBuffer)) - 1;
  int outputIndex = 0;

  Serial.print("Hex Message: ");
  Serial.println(hexMessage);
  Serial.print("UTF-8 Message: ");

  // Consume the message four hex digits (one 16-bit character) at a time.
  // The short-circuit checks stop at the terminator, so an incomplete trailing
  // group is ignored without reading past the end of the string.
  for (int i = 0; hexMessage[i] != '\0' && hexMessage[i+1] != '\0' && hexMessage[i+2] != '\0' && hexMessage[i+3] != '\0'; i += 4) {
    const char hexHighByte[3] = {hexMessage[i], hexMessage[i+1], '\0'};
    const char hexLowByte[3] = {hexMessage[i+2], hexMessage[i+3], '\0'};
    char tempUtf8[4];
    const int bytes = utf16BEHexToUtf8(hexHighByte, hexLowByte, tempUtf8);
    if (bytes <= 0) {
      continue; // Skip characters that could not be converted
    }

    // Copy a character only if all of its bytes fit, so the output never ends
    // in a truncated multi-byte sequence, and stop at the first one that does
    // not fit so the warning is printed once.
    if (outputIndex + bytes > outputCapacity) {
      Serial.println("\nWarning: Output buffer full!");
      break;
    }
    for (int j = 0; j < bytes; ++j) {
      utf8OutputBuffer[outputIndex++] = tempUtf8[j];
    }
  }
  utf8OutputBuffer[outputIndex] = '\0'; // Null-terminate the final string
  Serial.println(utf8OutputBuffer);
}

// Arduino loop() hook. Intentionally empty: every conversion in this sketch
// is performed and printed inside setup().
void loop() {
  // Nothing to do in loop for this example
}