JsonCpp: fixing garbled Chinese characters and Chinese punctuation decoded from Unicode escapes

Our project exchanges data as JSON: the client is written in Visual C++ and the server's data interface is written in PHP. When the JSON returned by the server contains Chinese text encoded as Unicode escapes, the client displays it as garbled characters.

A commonly suggested fix found online is to modify the codePointToUTF8 function in json_reader.cpp so that Chinese code points are converted to the local multibyte encoding instead of UTF-8.

Modified to the following:

/// Converts a Unicode code point into the bytes stored in the parsed string.
///
/// Despite the name, this modified version does not always emit UTF-8:
/// code points in 0x4E00-0x9FA5 and 0xF900-0xFA2D are converted with
/// wcstombs_s() while the C locale is temporarily switched to "chs", so the
/// caller receives that locale's multibyte encoding for them.
/// NOTE(review): "chs" is presumably the MSVC name for the Simplified Chinese
/// locale - confirm against the target CRT.
///
/// If the locale cannot be queried or switched, or the conversion fails, the
/// code point is encoded as ordinary 3-byte UTF-8 instead of being dropped.
///
/// NOTE(review): setlocale() changes process-wide state, so this function is
/// not safe to call concurrently with other locale-dependent code.
///
/// @param cp Unicode code point to encode.
/// @return The encoded bytes; empty when cp is greater than 0x10FFFF.
static inline JSONCPP_STRING codePointToUTF8(unsigned int cp) {
  JSONCPP_STRING result;

  // based on description from http://en.wikipedia.org/wiki/UTF-8

  if (cp <= 0x7f) {
    result.resize(1);
    result[0] = static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    result.resize(2);
    result[1] = static_cast<char>(0x80 | (0x3f & cp));
    result[0] = static_cast<char>(0xC0 | (0x1f & (cp >> 6)));
  } else if (cp <= 0xFFFF) {
    bool converted = false;

    if ((cp >= 0x4E00 && cp <= 0x9FA5) || (cp >= 0xF900 && cp <= 0xFA2D)) {
      const char* current = setlocale(LC_ALL, NULL);
      if (current != NULL) {
        // Copy the name first: the buffer returned by setlocale() may be
        // overwritten by the next setlocale() call.
        const std::string savedLocale(current);
        if (setlocale(LC_ALL, "chs") != NULL) {
          wchar_t src[2] = {0};
          char dest[5] = {0};
          src[0] = static_cast<wchar_t>(cp);
          // Convert one wide character into at most 2 bytes (plus the
          // terminator). A non-zero return or an empty buffer means the
          // character could not be represented in the target encoding.
          if (wcstombs_s(NULL, dest, sizeof(dest), src, 2) == 0 &&
              dest[0] != '\0') {
            result = dest;
            converted = true;
          }
          // Restore the caller's locale only if it was actually changed.
          setlocale(LC_ALL, savedLocale.c_str());
        }
      }
    }

    if (!converted) {
      // Regular 3-byte UTF-8 sequence: 1110xxxx 10xxxxxx 10xxxxxx.
      result.resize(3);
      result[2] = static_cast<char>(0x80 | (0x3f & cp));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[0] = static_cast<char>(0xE0 | (0xf & (cp >> 12)));
    }
  } else if (cp <= 0x10FFFF) {
    result.resize(4);
    result[3] = static_cast<char>(0x80 | (0x3f & cp));
    result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
    result[0] = static_cast<char>(0xF0 | (0x7 & (cp >> 18)));
  }

  return result;
}

With the modification above, Chinese characters are displayed correctly, but Chinese punctuation marks are still garbled. The fix is to look up the Unicode code points of the Chinese punctuation marks and add them to the condition that selects the Chinese-character conversion path.

Period 0x3002 。
Question mark 0xFF1F ？
Exclamation mark 0xFF01 ！
Comma 0xFF0C ，
Enumeration comma 0x3001 、
Semicolon 0xFF1B ；
Colon 0xFF1A ：
Quotes 0x300C 「
      0x300D 」
Quotes 0x300E 『
      0x300F 』
Quotes 0x2018 ‘
      0x2019 ’
Quotes 0x201C “
      0x201D ”
Brackets 0xFF08 （
      0xFF09 ）
Brackets 0x3014 〔
      0x3015 〕
Brackets 0x3010 【
      0x3011 】
Dash 0x2014 —
Ellipsis 0x2026 …
Connection mark (en dash) 0x2013 –
Separator dot 0xFF0E ．
Book title marks 0x300A 《
       0x300B 》
Book title marks 0x3008 〈
       0x3009 〉


The final modified function is:


/// Returns true when cp is one of the code points that codePointToUTF8()
/// converts through the "chs" locale instead of encoding as UTF-8: the ranges
/// 0x4E00-0x9FA5 and 0xF900-0xFA2D, plus an explicit list of punctuation
/// code points.
static inline bool usesLocaleConversion(unsigned int cp) {
  if ((cp >= 0x4E00 && cp <= 0x9FA5) || (cp >= 0xF900 && cp <= 0xFA2D)) {
    return true;
  }
  switch (cp) {
  case 0x2013: // connection mark (en dash)
  case 0x2014: // dash
  case 0x2018: // left single quote
  case 0x2019: // right single quote
  case 0x201C: // left double quote
  case 0x201D: // right double quote
  case 0x2026: // ellipsis
  case 0x3001: // enumeration comma
  case 0x3002: // period
  case 0x3008: // left single book title mark
  case 0x3009: // right single book title mark
  case 0x300A: // left double book title mark
  case 0x300B: // right double book title mark
  case 0x300C: // left corner quote
  case 0x300D: // right corner quote
  case 0x300E: // left white corner quote
  case 0x300F: // right white corner quote
  case 0x3010: // left lenticular bracket
  case 0x3011: // right lenticular bracket
  case 0x3014: // left tortoise-shell bracket
  case 0x3015: // right tortoise-shell bracket
  case 0xFF01: // exclamation mark
  case 0xFF08: // left parenthesis
  case 0xFF09: // right parenthesis
  case 0xFF0C: // comma
  case 0xFF0E: // separator dot
  case 0xFF1A: // colon
  case 0xFF1B: // semicolon
  case 0xFF1F: // question mark
    return true;
  default:
    return false;
  }
}

/// Converts a Unicode code point into the bytes stored in the parsed string.
///
/// Despite the name, this modified version does not always emit UTF-8:
/// Chinese ideographs and the listed Chinese punctuation marks (see
/// usesLocaleConversion()) are converted with wcstombs_s() while the C locale
/// is temporarily switched to "chs", so the caller receives that locale's
/// multibyte encoding for them.
/// NOTE(review): "chs" is presumably the MSVC name for the Simplified Chinese
/// locale - confirm against the target CRT.
///
/// If the locale cannot be queried or switched, or the conversion fails, the
/// code point is encoded as ordinary 3-byte UTF-8 instead of being dropped.
///
/// NOTE(review): setlocale() changes process-wide state, so this function is
/// not safe to call concurrently with other locale-dependent code.
///
/// @param cp Unicode code point to encode.
/// @return The encoded bytes; empty when cp is greater than 0x10FFFF.
static inline JSONCPP_STRING codePointToUTF8(unsigned int cp) {
  JSONCPP_STRING result;

  // based on description from http://en.wikipedia.org/wiki/UTF-8

  if (cp <= 0x7f) {
    result.resize(1);
    result[0] = static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    result.resize(2);
    result[1] = static_cast<char>(0x80 | (0x3f & cp));
    result[0] = static_cast<char>(0xC0 | (0x1f & (cp >> 6)));
  } else if (cp <= 0xFFFF) {
    bool converted = false;

    if (usesLocaleConversion(cp)) {
      const char* current = setlocale(LC_ALL, NULL);
      if (current != NULL) {
        // Copy the name first: the buffer returned by setlocale() may be
        // overwritten by the next setlocale() call.
        const std::string savedLocale(current);
        if (setlocale(LC_ALL, "chs") != NULL) {
          wchar_t src[2] = {0};
          char dest[5] = {0};
          src[0] = static_cast<wchar_t>(cp);
          // Convert one wide character into at most 2 bytes (plus the
          // terminator). A non-zero return or an empty buffer means the
          // character could not be represented in the target encoding.
          if (wcstombs_s(NULL, dest, sizeof(dest), src, 2) == 0 &&
              dest[0] != '\0') {
            result = dest;
            converted = true;
          }
          // Restore the caller's locale only if it was actually changed.
          setlocale(LC_ALL, savedLocale.c_str());
        }
      }
    }

    if (!converted) {
      // Regular 3-byte UTF-8 sequence: 1110xxxx 10xxxxxx 10xxxxxx.
      result.resize(3);
      result[2] = static_cast<char>(0x80 | (0x3f & cp));
      result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
      result[0] = static_cast<char>(0xE0 | (0xf & (cp >> 12)));
    }
  } else if (cp <= 0x10FFFF) {
    result.resize(4);
    result[3] = static_cast<char>(0x80 | (0x3f & cp));
    result[2] = static_cast<char>(0x80 | (0x3f & (cp >> 6)));
    result[1] = static_cast<char>(0x80 | (0x3f & (cp >> 12)));
    result[0] = static_cast<char>(0xF0 | (0x7 & (cp >> 18)));
  }

  return result;
}


It can perfectly solve the problem of Chinese garbled characters.

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324881745&siteId=291194637