Unicode.cpp
1 //
3 // SFML - Simple and Fast Multimedia Library
4 // Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
5 //
6 // This software is provided 'as-is', without any express or implied warranty.
7 // In no event will the authors be held liable for any damages arising from the use of this software.
8 //
9 // Permission is granted to anyone to use this software for any purpose,
10 // including commercial applications, and to alter it and redistribute it freely,
11 // subject to the following restrictions:
12 //
13 // 1. The origin of this software must not be misrepresented;
14 // you must not claim that you wrote the original software.
15 // If you use this software in a product, an acknowledgment
16 // in the product documentation would be appreciated but is not required.
17 //
18 // 2. Altered source versions must be plainly marked as such,
19 // and must not be misrepresented as being the original software.
20 //
21 // 3. This notice may not be removed or altered from any source distribution.
22 //
24 
26 // Headers
28 #include <SFML/System/Unicode.hpp>
29 #include <stdexcept>
30 #include <string.h>
31 
32 
34 // References :
35 //
36 // http://www.unicode.org/
37 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
38 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h
39 // http://people.w3.org/rishida/scripts/uniview/conversion
40 //
42 
namespace
{
    ////////////////////////////////////////////////////////////
    /// Count the elements of a zero-terminated string of any
    /// character type. The terminator itself is not counted.
    ///
    /// \param Str : Pointer to the first element (dereferenced
    ///              unconditionally, so it must not be null)
    ///
    /// \return Number of elements found before the terminator
    ////////////////////////////////////////////////////////////
    template <typename T>
    std::size_t StrLen(const T* Str)
    {
        // Walk a cursor up to the terminator; the distance covered is the length
        const T* Cursor = Str;
        while (*Cursor)
            ++Cursor;

        return static_cast<std::size_t>(Cursor - Str);
    }

    ////////////////////////////////////////////////////////////
    /// Get the current system locale
    ///
    /// \return The locale named "" when the implementation accepts
    ///         that name, a default-constructed std::locale otherwise
    ////////////////////////////////////////////////////////////
    std::locale GetCurrentLocale()
    {
        try
        {
            return std::locale("");
        }
        catch (const std::runtime_error&)
        {
            // It seems some implementations (Mac OS, MinGW) don't know
            // the "" locale name and report it with a runtime_error

            return std::locale();
        }
    }
}
75 
76 namespace sf
77 {
79 // Static member data
81 const int Unicode::UTF8TrailingBytes[256] =
82 {
83  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
84  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
85  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
86  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
87  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
91 };
92 const Uint32 Unicode::UTF8Offsets[6] =
93 {
94  0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080
95 };
96 const Uint8 Unicode::UTF8FirstBytes[7] =
97 {
98  0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
99 };
100 
101 
106 {
107  // Nothing to do
108 }
109 
110 
114 Unicode::Text::Text(const char* Str)
115 {
116  if (Str)
117  {
118  std::size_t Length = StrLen(Str);
119  if (Length > 0)
120  {
121  myUTF32String.reserve(Length + 1);
122  Unicode::ANSIToUTF32(Str, Str + Length, std::back_inserter(myUTF32String));
123  }
124  }
125 }
126 Unicode::Text::Text(const wchar_t* Str)
127 {
128  if (Str)
129  {
130  std::size_t Length = StrLen(Str);
131  if (Length > 0)
132  {
133  // See comments below, in Unicode::Text::Text(const std::wstring&)
134  myUTF32String.reserve(Length + 1);
135  switch (sizeof(wchar_t))
136  {
137  case 2 : Unicode::UTF16ToUTF32(Str, Str + Length, std::back_inserter(myUTF32String), 0); break;
138  case 4 : std::copy(Str, Str + Length, std::back_inserter(myUTF32String)); break;
139  default : break;
140  }
141  }
142  }
143 }
144 Unicode::Text::Text(const Uint8* Str)
145 {
146  if (Str)
147  {
148  std::size_t Length = StrLen(Str);
149  if (Length > 0)
150  {
151  myUTF32String.reserve(Length + 1);
152  Unicode::UTF8ToUTF32(Str, Str + Length, std::back_inserter(myUTF32String), 0);
153  }
154  }
155 }
156 Unicode::Text::Text(const Uint16* Str)
157 {
158  if (Str)
159  {
160  std::size_t Length = StrLen(Str);
161  if (Length > 0)
162  {
163  myUTF32String.reserve(Length+ 1);
164  Unicode::UTF16ToUTF32(Str, Str + Length, std::back_inserter(myUTF32String), 0);
165  }
166  }
167 }
168 Unicode::Text::Text(const Uint32* Str)
169 {
170  if (Str)
171  myUTF32String = Str;
172 }
173 Unicode::Text::Text(const std::string& Str)
174 {
175  myUTF32String.reserve(Str.length() + 1);
176  Unicode::ANSIToUTF32(Str.begin(), Str.end(), std::back_inserter(myUTF32String));
177 }
178 Unicode::Text::Text(const std::wstring& Str)
179 {
180  // This function assumes that 2-byte large wchar_t are encoded in UTF-16 (Windows), and
181  // 4-byte large wchar_t are encoded using UTF-32 (Unix)
182  // Is that always true ? (some platforms may use JIS Japanese encoding)
183  // The macro __STDC_ISO_10646__ should help identifying UTF-32 compliant implementations
184 
185  myUTF32String.reserve(Str.length() + 1);
186 
187  // Select the proper function according to the (supposed) wchar_t system encoding
188  switch (sizeof(wchar_t))
189  {
190  // wchar_t uses UTF-16 -- need a conversion
191  case 2 :
192  {
193  Unicode::UTF16ToUTF32(Str.begin(), Str.end(), std::back_inserter(myUTF32String), 0);
194  break;
195  }
196 
197  // wchar_t uses UTF-32 -- direct copy
198  case 4 :
199  {
200  std::copy(Str.begin(), Str.end(), std::back_inserter(myUTF32String));
201  break;
202  }
203 
204  // This should never happen
205  default : break;
206  }
207 }
209 {
210  myUTF32String.reserve(Str.length() + 1);
211  Unicode::UTF8ToUTF32(Str.begin(), Str.end(), std::back_inserter(myUTF32String), 0);
212 }
213 Unicode::Text::Text(const Unicode::UTF16String& Str)
214 {
215  myUTF32String.reserve(Str.length() + 1);
216  Unicode::UTF16ToUTF32(Str.begin(), Str.end(), std::back_inserter(myUTF32String), 0);
217 }
218 Unicode::Text::Text(const Unicode::UTF32String& Str)
219 {
220  myUTF32String = Str;
221 }
222 
223 
227 Unicode::Text::operator std::string() const
228 {
229  std::string Output;
230  Output.reserve(myUTF32String.length() + 1);
231  Unicode::UTF32ToANSI(myUTF32String.begin(), myUTF32String.end(), std::back_inserter(Output), 0, Unicode::GetDefaultLocale());
232  return Output;
233 }
234 Unicode::Text::operator std::wstring() const
235 {
236  // This function assumes that 2-byte large wchar_t are encoded in UTF-16 (Windows), and
237  // 4-byte large wchar_t are encoded using UTF-32 (Unix)
238  // Is that always true ? (some platforms may use JIS Japanese encoding)
239  // The macro __STDC_ISO_10646__ should help identifying UTF-32 compliant implementations
240 
241  std::wstring Output;
242  Output.reserve(myUTF32String.length() + 1);
243 
244  // Select the proper function according to the (supposed) wchar_t system encoding
245  switch (sizeof(wchar_t))
246  {
247  // wchar_t uses UTF-16 -- need a conversion
248  case 2 :
249  {
250  UTF32ToUTF16(myUTF32String.begin(), myUTF32String.end(), std::back_inserter(Output), 0);
251  break;
252  }
253 
254  // wchar_t uses UTF-32 -- direct copy
255  case 4 :
256  {
257  std::copy(myUTF32String.begin(), myUTF32String.end(), std::back_inserter(Output));
258  break;
259  }
260 
261  // This should never happen
262  default : break;
263  }
264  return Output;
265 }
266 Unicode::Text::operator sf::Unicode::UTF8String() const
267 {
268  Unicode::UTF8String Output;
269  Output.reserve(myUTF32String.length() * 4 + 1);
270  Unicode::UTF32ToUTF8(myUTF32String.begin(), myUTF32String.end(), std::back_inserter(Output), 0);
271  return Output;
272 }
273 Unicode::Text::operator sf::Unicode::UTF16String() const
274 {
275  Unicode::UTF16String Output;
276  Output.reserve(myUTF32String.length() * 2 + 1);
277  Unicode::UTF32ToUTF16(myUTF32String.begin(), myUTF32String.end(), std::back_inserter(Output), 0);
278  return Output;
279 }
280 Unicode::Text::operator const sf::Unicode::UTF32String&() const
281 {
282  return myUTF32String;
283 }
284 
285 
289 const std::locale& Unicode::GetDefaultLocale()
290 {
291  // It seems getting the default locale is a very expensive operation,
292  // so we only do it once and then store the locale for reuse.
293  // Warning : this code won't be aware of any change of the default locale during runtime
294 
295  static std::locale DefaultLocale = GetCurrentLocale();
296 
297  return DefaultLocale;
298 }
299 
300 } // namespace sf
static Out UTF32ToUTF8(In Begin, In End, Out Output, Uint8 Replacement= '?')
Generic function to convert a UTF-32 characters range to a UTF-8 characters range.
static Out ANSIToUTF32(In Begin, In End, Out Output, const std::locale &Locale=GetDefaultLocale())
Generic function to convert an ANSI characters range to an UTF-32 characters range, using the given locale.
Text()
Default constructor (empty text)
Definition: Unicode.cpp:105
std::basic_string< Uint8 > UTF8String
Define a string type for each encoding. Warning: in UTF8 and UTF16 strings, one element doesn't necessarily...
Definition: Unicode.hpp:54
static Out UTF16ToUTF32(In Begin, In End, Out Output, Uint32 Replacement= '?')
Generic function to convert a UTF-16 characters range to a UTF-32 characters range.
static Out UTF32ToUTF16(In Begin, In End, Out Output, Uint16 Replacement= '?')
Generic function to convert a UTF-32 characters range to a UTF-16 characters range.
static Out UTF8ToUTF32(In Begin, In End, Out Output, Uint32 Replacement= '?')
Generic function to convert a UTF-8 characters range to a UTF-32 characters range.
static Out UTF32ToANSI(In Begin, In End, Out Output, char Replacement= '?', const std::locale &Locale=GetDefaultLocale())
Generic function to convert an UTF-32 characters range to an ANSI characters range, using the given locale.