/*
This Arduino sketch demonstrates how to convert hex-coded 16-bit characters
(typically received from a GSM module in UCS2/UTF-16BE format) to UTF-8,
without using the String class.
The GSM module often sends SMS messages in UCS2 (UTF-16 Big Endian) when
special characters are present. This code parses the hex representation
of these 16-bit characters and converts them to their UTF-8 equivalent.
Author: [Your Name/Handle]
Date: July 22, 2025
License: MIT
*/
// Decodes one hexadecimal digit ('0'-'9', 'A'-'F' or 'a'-'f') into its
// numeric value (0-15). Any other character decodes to 0, so the caller
// cannot tell an invalid character apart from the digit '0'.
byte hexToByte(char c) {
  byte nibble = 0;  // result for anything that is not a hex digit
  if ('0' <= c && c <= '9') {
    nibble = static_cast<byte>(c - '0');
  } else if ('A' <= c && c <= 'F') {
    nibble = static_cast<byte>(10 + (c - 'A'));
  } else if ('a' <= c && c <= 'f') {
    nibble = static_cast<byte>(10 + (c - 'a'));
  }
  return nibble;
}
// Combines two hexadecimal digits into one byte: highNibble supplies the
// upper four bits and lowNibble the lower four (e.g., 'A', 'B' -> 0xAB).
byte hexToByte(char highNibble, char lowNibble) {
  const byte upperBits = hexToByte(highNibble);
  const byte lowerBits = hexToByte(lowNibble);
  return static_cast<byte>((upperBits << 4) | lowerBits);
}
// Decodes one hexadecimal digit. Returns 0-15, or -1 when `c` is not a hex
// digit. hexToByte() maps invalid characters to 0, which is indistinguishable
// from the digit '0', so the converter below uses this helper to detect errors.
static int hexDigitValue(char c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return c - 'A' + 10;
  }
  if (c >= 'a' && c <= 'f') {
    return c - 'a' + 10;
  }
  return -1;
}

// Decodes the first two characters of `pair` as one byte (0-255).
// Returns -1 if either character is not a hex digit. Because '\0' is not a
// hex digit, a string shorter than two characters is rejected before any
// character past its terminator is read.
static int hexPairValue(const char* pair) {
  const int high = hexDigitValue(pair[0]);
  if (high < 0) {
    return -1;
  }
  const int low = hexDigitValue(pair[1]);
  if (low < 0) {
    return -1;
  }
  return (high << 4) | low;
}

/**
 * Converts one hex-coded 16-bit code unit (UCS2 / UTF-16BE) to UTF-8.
 *
 * Only the first two characters of each hex string are read.
 *
 * @param hexCharHighByte Two hex digits of the high byte (e.g., "20" for U+20AC).
 * @param hexCharLowByte  Two hex digits of the low byte (e.g., "AC" for U+20AC).
 * @param outputBuffer    Receives the UTF-8 bytes. It is NOT null-terminated by
 *                        this function. Must hold at least 3 bytes (4 if the
 *                        caller appends a terminator).
 * @return The number of bytes written (1 to 3), or 0 if any pointer is null,
 *         any of the four characters is not a hex digit, or the value is a
 *         surrogate code unit (U+D800..U+DFFF). Nothing is written when 0 is
 *         returned.
 */
int utf16BEHexToUtf8(const char* hexCharHighByte, const char* hexCharLowByte, char* outputBuffer) {
  if (!hexCharHighByte || !hexCharLowByte || !outputBuffer) {
    return 0;  // Invalid input pointers
  }

  const int highByte = hexPairValue(hexCharHighByte);
  const int lowByte = hexPairValue(hexCharLowByte);
  if (highByte < 0 || lowByte < 0) {
    return 0;  // Not valid hex
  }

  // Widen to an unsigned 16-bit value before shifting so the high byte never
  // passes through a signed int (which may be only 16 bits wide).
  const uint16_t codeUnit = static_cast<uint16_t>(
      (static_cast<uint16_t>(highByte) << 8) | static_cast<uint16_t>(lowByte));

  // A surrogate is one half of a UTF-16 pair for a character beyond U+FFFF.
  // It has no UTF-8 encoding on its own, and this function only sees a single
  // code unit, so it is rejected instead of producing invalid UTF-8.
  if (codeUnit >= 0xD800 && codeUnit <= 0xDFFF) {
    return 0;
  }

  if (codeUnit <= 0x7F) {
    // 1-byte sequence (ASCII): 0xxxxxxx
    outputBuffer[0] = static_cast<char>(codeUnit);
    return 1;
  }

  if (codeUnit <= 0x7FF) {
    // 2-byte sequence: 110xxxxx 10xxxxxx
    outputBuffer[0] = static_cast<char>(0xC0 | (codeUnit >> 6));
    outputBuffer[1] = static_cast<char>(0x80 | (codeUnit & 0x3F));
    return 2;
  }

  // Every remaining 16-bit value (U+0800..U+FFFF) takes a 3-byte sequence:
  // 1110xxxx 10xxxxxx 10xxxxxx
  outputBuffer[0] = static_cast<char>(0xE0 | (codeUnit >> 12));
  outputBuffer[1] = static_cast<char>(0x80 | ((codeUnit >> 6) & 0x3F));
  outputBuffer[2] = static_cast<char>(0x80 | (codeUnit & 0x3F));
  return 3;
}
// Converts the 16-bit code unit given as two hex-digit pairs and prints
// "Hex: <high><low> -> UTF-8: <character>" on the serial port.
// Prints nothing when the conversion reports failure (returns 0).
static void printConversionExample(const char* hexHigh, const char* hexLow) {
  char utf8[4];  // Up to 3 UTF-8 bytes plus the null terminator
  const int written = utf16BEHexToUtf8(hexHigh, hexLow, utf8);
  if (written <= 0 || written >= static_cast<int>(sizeof(utf8))) {
    return;  // Conversion failed, or no room left for the terminator
  }
  utf8[written] = '\0';  // The converter does not null-terminate
  Serial.print("Hex: ");
  Serial.print(hexHigh);
  Serial.print(hexLow);
  Serial.print(" -> UTF-8: ");
  Serial.println(utf8);
}

// Runs once at start-up: opens the serial port, prints five single-character
// conversion examples, then converts a complete hex-encoded UCS2 message to
// UTF-8 and prints it.
void setup() {
  Serial.begin(9600);
  while (!Serial) {
    ;  // Wait for Serial port to connect. Needed for native USB port only
  }
  Serial.println("UTF-16BE Hex to UTF-8 Converter (Arduino)");
  Serial.println("----------------------------------------");

  // Example 1: ASCII space ' ' - U+0020 (1-byte UTF-8 sequence)
  printConversionExample("00", "20");
  // Example 2: Euro sign (Currency Symbols block) - U+20AC (3-byte sequence)
  printConversionExample("20", "AC");
  // Example 3: Greek capital letter Omega - U+03A9 (2-byte sequence)
  printConversionExample("03", "A9");
  // Example 4: Latin capital letter 'A' - U+0041 (1-byte sequence)
  printConversionExample("00", "41");
  // Example 5: Cent sign - U+00A2 (2-byte sequence)
  printConversionExample("00", "A2");

  // Example of processing a complete hex-encoded message
  Serial.println("\nProcessing a sample hex-encoded message:");
  // Sample message: "Hello €Ω!" in UCS2 hex (00480065006C006C006F002020AC03A90021)
  const char* hexMessage = "00480065006C006C006F002020AC03A90021";
  char utf8OutputBuffer[100];  // Buffer to hold the entire converted message
  const size_t capacity = sizeof(utf8OutputBuffer) - 1;  // Keep one byte for '\0'
  size_t outputIndex = 0;

  Serial.print("Hex Message: ");
  Serial.println(hexMessage);
  Serial.print("UTF-8 Message: ");

  // Consume the message four hex digits (one 16-bit code unit) at a time.
  // The chained '\0' checks stop before a trailing group of fewer than four
  // digits, so such a remainder is ignored.
  for (size_t i = 0;
       hexMessage[i] != '\0' && hexMessage[i + 1] != '\0' &&
       hexMessage[i + 2] != '\0' && hexMessage[i + 3] != '\0';
       i += 4) {
    const char hexHighByte[3] = {hexMessage[i], hexMessage[i + 1], '\0'};
    const char hexLowByte[3] = {hexMessage[i + 2], hexMessage[i + 3], '\0'};
    char tempUtf8[4];
    const int bytes = utf16BEHexToUtf8(hexHighByte, hexLowByte, tempUtf8);
    if (bytes <= 0) {
      continue;  // Skip code units that could not be converted
    }

    // Append a character only if its whole UTF-8 sequence fits, so the output
    // never ends in the middle of a multi-byte character. Stop at the first
    // character that does not fit so the warning is printed only once.
    if (outputIndex + static_cast<size_t>(bytes) > capacity) {
      Serial.println("\nWarning: Output buffer full!");
      break;
    }
    for (int j = 0; j < bytes; ++j) {
      utf8OutputBuffer[outputIndex++] = tempUtf8[j];
    }
  }

  utf8OutputBuffer[outputIndex] = '\0';  // Null-terminate the final string
  Serial.println(utf8OutputBuffer);
}
// Intentionally empty: this example does all of its work once, in setup().
void loop() {}