This article draws on the article "The C++ version of UnEscape parses \uxxxx\uxxxx encoded characters" (link: https://www.cnblogs.com/guolongzheng/p/9375956.html ) in "The night is deep and the tea is cold".
The original code has an unfixed bug: ordinary non-escaped substrings, such as "2018" and "V" in the string below, are dropped from the output of the `UnEscape()` function.
"input" string: "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63"
Expected output: "Maternity wear summer dress 2018 new loose short-sleeved mid-length top chiffon V-neck bowknot dress summer"
Actual output: "Maternity wear summer new loose short-sleeved mid-length top chiffon collar bowknot dress summer"
I corrected the original author's code and also wrote a C version for reference; feedback and suggestions are welcome.
C++:
#include <iostream>
#include <string>
#include <codecvt>
#include <locale>
#include <windows.h>
using namespace std;
/// Returns the numeric value of the hexadecimal digit `c`, or -1 if `c` is not
/// a hexadecimal digit.
static int HexDigitValue(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
}

/// Tries to read a complete `\uXXXX` escape (backslash, 'u', exactly four hex
/// digits) starting at `input[pos]`.
/// @param input text being scanned
/// @param pos   index at which the escape is expected to start
/// @param unit  receives the 16-bit value on success; untouched on failure
/// @return true if a well-formed escape was found at `pos`
static bool ParseUnicodeEscape(const std::string& input, std::size_t pos, char32_t& unit) {
    // Six characters are required; checked this way round to avoid overflow.
    if (input.size() < 6 || pos > input.size() - 6) return false;
    if (input[pos] != '\\' || input[pos + 1] != 'u') return false;

    char32_t value = 0;
    for (std::size_t k = pos + 2; k < pos + 6; ++k) {
        const int digit = HexDigitValue(input[k]);
        if (digit < 0) return false;
        value = (value << 4) | static_cast<char32_t>(digit);
    }
    unit = value;
    return true;
}

/// Appends the UTF-8 encoding of the code point `cp` (at most U+10FFFF) to `out`.
static void AppendUtf8(std::string& out, char32_t cp) {
    if (cp < 0x80) {
        out += static_cast<char>(cp);
    } else if (cp < 0x800) {
        out += static_cast<char>(0xC0 | (cp >> 6));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    } else if (cp < 0x10000) {
        out += static_cast<char>(0xE0 | (cp >> 12));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    } else {
        out += static_cast<char>(0xF0 | (cp >> 18));
        out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    }
}

/// Replaces every `\uXXXX` escape in `input` with the UTF-8 encoding of the
/// code point it names and copies all other bytes through unchanged.
///
/// - A high surrogate escape immediately followed by a low surrogate escape is
///   combined into a single supplementary-plane code point.
/// - An unpaired surrogate escape is replaced by U+FFFD, because a lone
///   surrogate has no valid UTF-8 encoding.
/// - A malformed escape (fewer than four hex digits, or non-hex characters) is
///   not an error: its characters are copied literally.
///
/// @param input text that may contain `\uXXXX` escapes
/// @return the decoded text, encoded as UTF-8
std::string Unescape(const std::string& input) {
    std::string result;
    result.reserve(input.size());  // decoded output is never longer than the input

    std::size_t i = 0;
    while (i < input.size()) {
        char32_t unit = 0;
        if (!ParseUnicodeEscape(input, i, unit)) {
            // Ordinary byte (or malformed escape): copy as-is, without widening,
            // so bytes >= 0x80 are preserved exactly.
            result += input[i++];
            continue;
        }
        i += 6;

        char32_t code_point = unit;
        if (unit >= 0xD800 && unit <= 0xDBFF) {
            char32_t low = 0;
            if (ParseUnicodeEscape(input, i, low) && low >= 0xDC00 && low <= 0xDFFF) {
                code_point = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                i += 6;
            } else {
                code_point = 0xFFFD;  // high surrogate without its partner
            }
        } else if (unit >= 0xDC00 && unit <= 0xDFFF) {
            code_point = 0xFFFD;  // low surrogate without a preceding high one
        }
        AppendUtf8(result, code_point);
    }
    return result;
}
int main() {
if (GetConsoleOutputCP() != CP_UTF8)
SetConsoleOutputCP(CP_UTF8);
string input = "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63";
string result = Unescape(input);
if (!result.empty()) {
cout << "原转义字符串:"<< input << "\n\n转义还原后的结果:" << result << endl;
}
return 0;
}
C:
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <wchar.h> #include <windows.h> char* Unescape(char* input) { size_t inputLen = strlen(input); wchar_t* wresult = malloc((inputLen + 1) * sizeof(wchar_t)); if (wresult == NULL) { return NULL; } wchar_t* wr = wresult; char* p = input; while (*p) { if (*p == '\\' && *(p + 1) == 'u') { char code[5] = {0}; strncpy(code, p + 2, 4); int unicode = strtol(code, NULL, 16); *wr++ = unicode; p += 6; } else { *wr++ = *p++; } } *wr = L'\0'; int len = WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL); char* result = malloc(len + 1); if (result != NULL) { WideCharToMultiByte(CP_UTF8, 0, wresult, -1, result, len + 1, NULL, NULL); } free(wresult); return result; } int main() { char *input = "\\u5b55\\u5987\\u88c5\\u590f\\u88c52018\\u65b0\\u6b3e\\u5bbd\\u677e\\u77ed\\u8896\\u4e2d\\u957f\\u6b3e\\u4e0a\\u8863\\u96ea\\u7ebaV\\u9886\\u8774\\u8776\\u7ed3\\u8fde\\u8863\\u88d9\\u590f\\u5b63"; char* result = Unescape(input); if (GetConsoleOutputCP() != CP_UTF8) SetConsoleOutputCP(CP_UTF8); if (result != NULL) { printf("原转义字符串:%s\n\n转义还原后的结果:%s\n", input, result); free(result); } return 0; }
In the Windows Command Prompt, the program runs correctly with the code page set to either 936 (GB2312) or 65001 (UTF-8):