C/C++ handling of escaped \uxxxx characters

This article builds on the post "The C++ version of UnEscape parses \uxxxx\uxxxx encoded characters" (link: https://www.cnblogs.com/guolongzheng/p/9375956.html ) by the blogger "The night is deep and the tea is cold".

The original code has an unfixed bug: ordinary, non-escaped characters in the string, such as "2018" and "V", are missing from the output after the string is processed by the `UnEscape()` function.

Input string: "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63"

Expected output: "Maternity wear summer dress 2018 new loose short-sleeved mid-length top chiffon V-neck bowknot dress summer"

Actual output: "Maternity wear summer new loose short-sleeved mid-length top chiffon collar bowknot dress summer"

I fixed this bug in the original author's code and also wrote a C version for reference. Feedback and suggestions are welcome.

C++:

#include <iostream>
#include <string>
#include <codecvt>
#include <locale>
#include <windows.h>

using namespace std;

namespace {

// Returns the numeric value of the hexadecimal digit `c`, or -1 if `c` is
// not a hexadecimal digit.
int HexDigitValue(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

// Tries to decode a "\uXXXX" escape that starts at input[pos].
// Returns the 16-bit value (0..0xFFFF) on success, or -1 when the text at
// `pos` is not a backslash, 'u' and exactly four hexadecimal digits
// (including the case where fewer than six characters remain).
long ParseUnicodeEscape(const std::string& input, std::string::size_type pos) {
    if (pos > input.size() || input.size() - pos < 6) return -1;
    if (input[pos] != '\\' || input[pos + 1] != 'u') return -1;
    long value = 0;
    for (std::string::size_type k = pos + 2; k < pos + 6; ++k) {
        const int digit = HexDigitValue(input[k]);
        if (digit < 0) return -1;
        value = value * 16 + digit;
    }
    return value;
}

// Appends the UTF-8 encoding of `code_point` (at most 0x10FFFF) to `out`.
void AppendUtf8(std::string& out, unsigned long code_point) {
    if (code_point < 0x80) {
        out += static_cast<char>(code_point);
    } else if (code_point < 0x800) {
        out += static_cast<char>(0xC0 | (code_point >> 6));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    } else if (code_point < 0x10000) {
        out += static_cast<char>(0xE0 | (code_point >> 12));
        out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    } else {
        out += static_cast<char>(0xF0 | (code_point >> 18));
        out += static_cast<char>(0x80 | ((code_point >> 12) & 0x3F));
        out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    }
}

}  // namespace

// Replaces every "\uXXXX" escape in `input` with the UTF-8 encoding of the
// character it denotes and returns the resulting string.
//
// - Characters that are not part of an escape are copied through unchanged,
//   byte for byte, so text that already contains UTF-8 stays intact.
// - A high surrogate escape immediately followed by a low surrogate escape
//   (e.g. "\ud83d\ude00") is combined into a single supplementary code point.
// - A surrogate escape without its partner is replaced by U+FFFD.
// - A malformed escape (fewer than four hex digits, or non-hex characters
//   after "\u") is not treated as an escape and is copied literally; this
//   function does not throw on malformed input.
std::string Unescape(const std::string& input) {
    std::string result;
    // Every escape produces fewer bytes than it consumes, so the output can
    // never be longer than the input.
    result.reserve(input.size());

    std::string::size_type i = 0;
    while (i < input.size()) {
        const long unit = ParseUnicodeEscape(input, i);
        if (unit < 0) {
            // Ordinary character (or malformed escape): keep it as is.
            result += input[i++];
            continue;
        }
        i += 6;

        unsigned long code_point = static_cast<unsigned long>(unit);
        if (unit >= 0xD800 && unit <= 0xDBFF) {
            // High surrogate: only valid when a low surrogate escape follows.
            const long low = ParseUnicodeEscape(input, i);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                code_point = 0x10000UL +
                             ((static_cast<unsigned long>(unit) - 0xD800UL) << 10) +
                             (static_cast<unsigned long>(low) - 0xDC00UL);
                i += 6;
            } else {
                code_point = 0xFFFD;
            }
        } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
            // Low surrogate with no preceding high surrogate.
            code_point = 0xFFFD;
        }
        AppendUtf8(result, code_point);
    }
    return result;
}

int main() {
    if (GetConsoleOutputCP() != CP_UTF8)
        SetConsoleOutputCP(CP_UTF8);
    string input = "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63";
    string result = Unescape(input);
    if (!result.empty()) {
        cout << "原转义字符串:"<< input << "\n\n转义还原后的结果:" << result << endl;
    }
    return 0;
}

C:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <windows.h>

/* Returns the numeric value of the hexadecimal digit `c`, or -1 if `c` is
 * not a hexadecimal digit. */
static int HexDigitValue(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

/* Tries to decode a "\uXXXX" escape that starts at `p`.
 * Returns the 16-bit value (0..0xFFFF) on success, or -1 when the text is
 * not a backslash, 'u' and exactly four hexadecimal digits.
 * It never reads past the NUL terminator: the terminator matches neither
 * '\\', 'u' nor a hex digit, so the scan stops as soon as it is reached. */
static long ParseUnicodeEscape(const char* p) {
    long value = 0;
    int k;

    if (p[0] != '\\' || p[1] != 'u') {
        return -1;
    }
    for (k = 2; k < 6; ++k) {
        int digit = HexDigitValue(p[k]);
        if (digit < 0) {
            return -1;
        }
        value = value * 16 + digit;
    }
    return value;
}

/* Writes the UTF-8 encoding of `codePoint` (at most 0x10FFFF) at `out` and
 * returns a pointer just past the last byte written. */
static char* AppendUtf8(char* out, unsigned long codePoint) {
    if (codePoint < 0x80) {
        *out++ = (char)codePoint;
    } else if (codePoint < 0x800) {
        *out++ = (char)(0xC0 | (codePoint >> 6));
        *out++ = (char)(0x80 | (codePoint & 0x3F));
    } else if (codePoint < 0x10000) {
        *out++ = (char)(0xE0 | (codePoint >> 12));
        *out++ = (char)(0x80 | ((codePoint >> 6) & 0x3F));
        *out++ = (char)(0x80 | (codePoint & 0x3F));
    } else {
        *out++ = (char)(0xF0 | (codePoint >> 18));
        *out++ = (char)(0x80 | ((codePoint >> 12) & 0x3F));
        *out++ = (char)(0x80 | ((codePoint >> 6) & 0x3F));
        *out++ = (char)(0x80 | (codePoint & 0x3F));
    }
    return out;
}

/* Replaces every "\uXXXX" escape in `input` with the UTF-8 encoding of the
 * character it denotes.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * release with free(), or NULL if `input` is NULL or allocation fails.
 *
 * - Bytes that are not part of an escape are copied through unchanged.
 * - A high surrogate escape immediately followed by a low surrogate escape
 *   is combined into one supplementary code point.
 * - A surrogate escape without its partner is replaced by U+FFFD.
 * - A malformed escape (fewer than four hex digits, or non-hex characters
 *   after "\u") is copied literally instead of being decoded.
 * - "\u0000" yields a NUL byte, so the returned C string ends there. */
char* Unescape(const char* input) {
    char* result;
    char* out;
    const char* p;

    if (input == NULL) {
        return NULL;
    }

    /* Each escape produces fewer bytes than it consumes (6 -> at most 3,
     * 12 -> 4 for a surrogate pair), so the output never exceeds the input. */
    result = (char*)malloc(strlen(input) + 1);
    if (result == NULL) {
        return NULL;
    }

    out = result;
    p = input;
    while (*p) {
        long unit = ParseUnicodeEscape(p);
        unsigned long codePoint;

        if (unit < 0) {
            /* Ordinary byte (or malformed escape): keep it as is. */
            *out++ = *p++;
            continue;
        }
        p += 6;

        codePoint = (unsigned long)unit;
        if (unit >= 0xD800 && unit <= 0xDBFF) {
            /* High surrogate: only valid when a low surrogate escape follows. */
            long low = ParseUnicodeEscape(p);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                codePoint = 0x10000UL
                          + (((unsigned long)unit - 0xD800UL) << 10)
                          + ((unsigned long)low - 0xDC00UL);
                p += 6;
            } else {
                codePoint = 0xFFFD;
            }
        } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
            /* Low surrogate with no preceding high surrogate. */
            codePoint = 0xFFFD;
        }
        out = AppendUtf8(out, codePoint);
    }
    *out = '\0';
    return result;
}

/* Demo entry point: unescapes a sample string, switches the console to
 * UTF-8 output and prints both the escaped and the restored text. */
int main() {
    char *input = "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63";
    char *decoded = Unescape(input);

    /* Make sure the console interprets the bytes we print as UTF-8. */
    if (GetConsoleOutputCP() != CP_UTF8) {
        SetConsoleOutputCP(CP_UTF8);
    }

    /* Conversion failed: there is nothing to print or release. */
    if (decoded == NULL) {
        return 0;
    }

    printf("原转义字符串:%s\n\n转义还原后的结果:%s\n", input, decoded);
    free(decoded);  /* the buffer returned by Unescape is owned by the caller */
    return 0;
}

In the Windows command prompt, the program produces the correct output whether the code page is set to 936 (GBK/GB2312) or 65001 (UTF-8):

 

Guess you like

Origin blog.csdn.net/Scott0902/article/details/131309384