String representations and conversions

    In Windows programming, there are several ways to represent strings, including:

    1) C-Style string char * / wchar_t *;

    2) C ++ STL string std :: string / std :: wstring;

    3) ATL / MFC string CStringA / CStringW;

    Each of the above comes in a multibyte (MBCS) version and a corresponding Unicode (wide-character, UTF-16) version. The different multibyte encoding schemes are distinguished by code page. On Windows, the common code pages are:

    1) MS-DOS Latin US, code page 437, which uses one 8-bit byte per Latin character;

    2) Windows code page 1251 for Cyrillic, which uses one 8-bit byte per Cyrillic character;

    3) code pages 936/950/932/949 for ideographic scripts (Simplified Chinese, Traditional Chinese, Japanese and Korean, respectively), which use one 8-bit byte for a Latin character and two 8-bit bytes for an ideograph;

    For ideographic text, using strlen to measure string length is not meaningful: strlen simply returns the number of bytes that precede the terminating '\0', not the number of characters.

    Unicode (UTF-16 on Windows) represents each character with two bytes, and its first 128 code points are identical to ASCII (American Standard Code for Information Interchange). For non-ideographic languages, half of that space is wasted, which led to UTF-8, an encoding form based on Unicode. In UTF-8, basic Latin (ASCII) characters need only one byte, while other characters, including Cyrillic and ideographs, use 2, 3 or 4 bytes; see the blog post "Character sets and encodings" for details. In Visual C++ development, UTF-8 is treated as a special code page, and a UTF-8 string can be obtained with the function WideCharToMultiByte(CP_UTF8, ...).

    

    How can both encoding schemes (MBCS and Unicode) be maintained in a single source file?

    The Windows headers define the following:

 

 1 typedef wchar_t WCHAR;    // wc,   16-bit UNICODE character
 2 
 3 #ifdef  UNICODE   
 4 // PTCHAR is a pointer to WCHAR 
 5 typedef WCHAR TCHAR, *PTCHAR;
 6 #else
 7 // PTCHAR is a pointer to char
 8 typedef char TCHAR, *PTCHAR;
 9 #endif 
10 
11 #ifdef  _UNICODE
12 #define _tcscpy        wcscpy
13 #define _tcslen         wcslen
14 ...
15 #else
16 #define _tcscpy     strcpy
17 #define _tcslen     strlen
18 ...
19 #endif
20 
21 #ifdef  _UNICODE
22 #define __T(x)      L ## x
23 #define __TEXT(quote) L##quote 
24 #else
25 #define __T(x)      x
26 #define __TEXT(quote) quote
27 #endif
28 
29 // _T(x) __T(x) TEXT(x) _TEXT(x) __TEXT(x) same meaning
30 #define _T(x)       __T(x)
31 #define _TEXT(x)    __T(x)
32 #define TEXT(quote) __TEXT(quote)

    With the above definitions, a single source file can handle C-style strings under both encodings, as follows:

1 TCHAR *sz = _T("ch中国");
2 int iLen = _tcslen(sz);
3 TCHAR *sz2 = new TCHAR[iLen + 1];
4 _tcscpy(sz2, sz);
5 delete[] sz2;

 

    When "Use Multi-Byte Character Set" (or "Not Set") is selected under Project Properties -> Configuration Properties -> General, iLen is 6, because each Chinese character occupies two bytes; when "Use Unicode Character Set" is selected, iLen is 4, because sz consists of four characters.

    The C++ STL strings and the ATL/MFC strings are wrappers around character data (char / wchar_t / TCHAR), as follows:

 

 1 // c++ string
 2 typedef basic_string<char, char_traits<char>, allocator<char> >
 3     string;
 4 typedef basic_string<wchar_t, char_traits<wchar_t>,
 5     allocator<wchar_t> > wstring;
 6 
 7 // atl/mfc string
 8 typedef ATL::CStringT< wchar_t, StrTraitMFC_DLL< wchar_t > > CStringW;
 9 typedef ATL::CStringT< char, StrTraitMFC_DLL< char > > CStringA;
10 typedef ATL::CStringT< TCHAR, StrTraitMFC_DLL< TCHAR > > CString;
11 
12 // Since the C++ string library does not provide a TCHAR version, add the following
13 // definition to switch automatically between multi-byte and Unicode
14 #ifdef _UNICODE
15 #define tstring std::wstring
16 #else
17 #define tstring std::string
18 #endif

 

    With the above definitions, using tstring (custom) and CString, both character types can be handled in a single source file. Windows also allows a specific character type to be used explicitly, and provides functions for converting between MBCS and Unicode, as follows:

 

 1 // char* -> wchar_t*
 2 char *sz1 = "ch中国";
 3 // Get the length of sz1 after conversion to a Unicode string
 4 int iLen = MultiByteToWideChar(CP_ACP, 0, sz1, strlen(sz1), NULL, 0);
 5 wchar_t *sz2 = new wchar_t[iLen + 1];
 6 MultiByteToWideChar(CP_ACP, 0, sz1, strlen(sz1), sz2, iLen);
 7 sz2[iLen] = '\0';   // terminate the string with '\0'
 8 
 9 // wchar_t* -> char*
10 wchar_t *sz3 = L"ch中国";
11 // Get the length of sz3 after conversion to a multi-byte string
12 int iLen2 = WideCharToMultiByte(CP_ACP, 0, sz3, wcslen(sz3), NULL, 0, NULL, NULL);
13 char *sz4 = new char[iLen2 + 1];
14 WideCharToMultiByte(CP_ACP, 0, sz3, wcslen(sz3), sz4, iLen2, NULL, NULL);
15 sz4[iLen2] = '\0'; // the L prefix is not needed here; a single character is zero-extended automatically
16 
17 // CStringA -> CStringW
18 CStringA szA("ch中国");
19 CStringW szW(szA);
20 
21 // CStringW -> CStringA
22 CStringW szW2(L"ch中国");
23 CStringA szA2(szW2);
24 
25 
26 // Unicode -> UTF8 -> Unicode
27 // Visual Studio displays multi-byte strings using CP_ACP by default; to view a UTF-8 encoded string, append ",s8" to the expression in the Watch window
28 wchar_t *szUtf16 = L"ch中国";
29 int len = WideCharToMultiByte(CP_UTF8, 0, szUtf16, wcslen(szUtf16), NULL, 0, NULL, NULL);
 30  char *szUtf8 = new char[len + 1];
31 WideCharToMultiByte(CP_UTF8, 0, szUtf16, wcslen(szUtf16), szUtf8, len, NULL, NULL);
32 szUtf8[len] = '\0';
33 int len2 = MultiByteToWideChar(CP_UTF8, 0, szUtf8, strlen(szUtf8), NULL, 0);
34 wchar_t *szUtf16_2 = new wchar_t[len2 + 1];
35 MultiByteToWideChar(CP_UTF8, 0, szUtf8, strlen(szUtf8), szUtf16_2, len2);
36 szUtf16_2[len2] = '\0';

    Different string types are often used together when programming, so how do we convert between them? The code is shown below:

 

 1 // TCHAR* <-> CString
 2 TCHAR *sz = _T("ch中国");
 3 CString cstr(sz);
 4 TCHAR* sz2 = cstr.GetBuffer();
 5 
 6 // TCHAR* <-> tstring
 7 tstring tstr(sz);
 8 const TCHAR* sz3 = tstr.c_str();  // tstr.data() is also ok
 9 
10 // CString <-> tstring
11 CString cstr2;
12 tstring tstr2;
13 tstr2 = cstr.GetBuffer();
14 cstr2 = tstr.c_str();

 

Guess you like

Origin www.cnblogs.com/luofeiju/p/12501873.html