Utf8.h
1 /* Copyright (C) 2016 Marc Boris Duerner
2 
3  This library is free software; you can redistribute it and/or
4  modify it under the terms of the GNU Lesser General Public
5  License as published by the Free Software Foundation; either
6  version 2.1 of the License, or (at your option) any later version.
7 
8  As a special exception, you may use this file as part of a free
9  software library without restriction. Specifically, if other files
10  instantiate templates or use macros or inline functions from this
11  file, or you compile this file and link it with other files to
12  produce an executable, this file does not by itself cause the
13  resulting executable to be covered by the GNU General Public
14  License. This exception does not however invalidate any other
15  reasons why the executable file might be covered by the GNU Library
16  General Public License.
17 
18  This library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  Lesser General Public License for more details.
22 
23  You should have received a copy of the GNU Lesser General Public
24  License along with this library; if not, write to the Free Software
25  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
26  MA 02110-1301 USA
27 */
28 
29 #ifndef Pt_Utf8_h
30 #define Pt_Utf8_h
31 
32 #include <Pt/Api.h>
33 #include <Pt/Utf8Codec.h>
34 #include <Pt/String.h>
35 #include <iterator>
36 #include <string>
37 
38 namespace Pt {
39 
58 class PT_API Utf8Iterator
59 {
60  public:
61  typedef Char value_type;
62  typedef std::ptrdiff_t difference_type;
63  typedef Char* pointer;
64  typedef const Char& reference;
65  typedef std::input_iterator_tag iterator_category;
66 
67  public:
71  : _utf8(0)
72  , _n(0)
73  { }
74 
77  explicit Utf8Iterator(const char* utf8, std::size_t n)
78  : _utf8(utf8)
79  , _n(n)
80  {
81  decode();
82  }
83 
86  explicit Utf8Iterator(const std::string& bytes)
87  : _utf8(bytes.data())
88  , _n(bytes.size())
89  {
90  decode();
91  }
92 
95  Utf8Iterator(const Utf8Iterator& other)
96  : _utf8(other._utf8)
97  , _value(other._value)
98  , _n(other._n)
99  {}
100 
104  {
105  _utf8 = other._utf8;
106  _value = other._value;
107  _n = other._n;
108  return *this;
109  }
110 
114  {
115  if(_n == 0)
116  _utf8 = 0;
117  else
118  decode();
119 
120  return *this;
121  }
122 
126  {
127  Utf8Iterator tmp = *this;
128  ++*this;
129  return tmp;
130  }
131 
134  const Char& operator*() const
135  { return _value; }
136 
139  bool operator!=(const Utf8Iterator& other) const
140  { return _utf8 != other._utf8; }
141 
144  bool operator==(const Utf8Iterator& other) const
145  { return _utf8 == other._utf8; }
146 
147  private:
148  void decode();
149 
150  private:
151  Utf8Codec _codec;
152  const char* _utf8;
153  std::size_t _n;
154  Char _value;
155 };
156 
174 class PT_API Utf8Appender
175 {
176  public:
177  typedef Char value_type;
178  typedef std::ptrdiff_t difference_type;
179  typedef Char* pointer;
180  typedef const Char& reference;
181  typedef std::output_iterator_tag iterator_category;
182 
183  public:
186  explicit Utf8Appender(std::string& str)
187  : _str(&str)
188  { }
189 
193  : _str(other._str)
194  { }
195 
199  {
200  _str = other._str;
201  return *this;
202  }
203 
207  {
208  encode(ch);
209  return *this;
210  }
211 
215  {
216  return *this;
217  }
218 
222  {
223  return *this;
224  }
225 
229  {
230  return *this;
231  }
232 
233  private:
234  void encode(const Char& ch);
235 
236  private:
237  Utf8Codec _codec;
238  std::string* _str;
239 };
240 
269 class PT_API Utf8Convert
270 {
271  public:
273 
274  public:
277  explicit Utf8Convert(TextCodec<Char, char>* codec);
278 
281  ~Utf8Convert();
282 
285  std::string toBytes(const char* utf8, std::size_t n);
286 
289  std::string toBytes(const std::string& bytes)
290  {
291  return toBytes(bytes.data(), bytes.size());
292  }
293 
296  std::string fromBytes(const char* bytes, std::size_t n);
297 
300  std::string fromBytes(const std::string& bytes)
301  {
302  return fromBytes(bytes.data(), bytes.size());
303  }
304 
305  private:
306  Utf8Convert(const Utf8Convert&);
307  Utf8Convert& operator=(const Utf8Convert&);
308 
309  private:
310  TextCodec<Char, char>* _codec;
311 
312  static const std::size_t _ibufSize = 16;
313  Char _ibuf[_ibufSize];
314 
315  static const std::size_t _ebufSize = 32;
316  char _ebuf[_ebufSize];
317 };
318 
319 } //namespace Pt
320 
321 #endif // include guard
Pt::Utf8Iterator
UTF-8 string input iterator.
Definition: Utf8.h:58
Utf8Appender(const Utf8Appender &other)
Copy constructor.
Definition: Utf8.h:192
const Char & operator*() const
Returns current character.
Definition: Utf8.h:134
Utf8Iterator operator++(int)
Decodes next character.
Definition: Utf8.h:125
Utf8Appender & operator=(const Utf8Appender &other)
Assignment operator.
Definition: Utf8.h:198
Utf8Appender & operator*()
No-op.
Definition: Utf8.h:214
Utf8Iterator(const char *utf8, std::size_t n)
Construct from UTF-8 string data.
Definition: Utf8.h:77
Utf8Appender & operator++()
No-op.
Definition: Utf8.h:221
std::string fromBytes(const std::string &bytes)
Decode from external encoding.
Definition: Utf8.h:300
bool operator!=(const Utf8Iterator &other) const
Inequality comparison.
Definition: Utf8.h:139
Utf8Iterator()
Construct end iterator.
Definition: Utf8.h:70
Utf8Appender & operator=(const Char &ch)
Encodes a unicode character to the target string.
Definition: Utf8.h:206
Pt::Char
Unicode character type.
Definition: String.h:66
std::string toBytes(const std::string &bytes)
Encode to external encoding.
Definition: Utf8.h:289
Utf8Iterator & operator=(const Utf8Iterator &other)
Assignment operator.
Definition: Utf8.h:103
Utf8Iterator(const Utf8Iterator &other)
Copy constructor.
Definition: Utf8.h:95
Utf8Appender(std::string &str)
Construct from UTF-8 encoded string.
Definition: Utf8.h:186
Utf8Appender operator++(int)
No-op.
Definition: Utf8.h:228
bool operator==(const Utf8Iterator &other) const
Equality comparison.
Definition: Utf8.h:144
Pt::Utf8Appender
UTF-8 string output iterator.
Definition: Utf8.h:174
Utf8Iterator & operator++()
Decodes next character.
Definition: Utf8.h:113
Pt::Utf8Convert
UTF-8 string converter.
Definition: Utf8.h:269
Utf8Iterator(const std::string &bytes)
Construct from UTF-8 encoded string.
Definition: Utf8.h:86
Pt::Utf8Codec
Convert between unicode and UTF-8.
Definition: Utf8Codec.h:43