2012-11-13 22:01:12 +02:00
|
|
|
#ifndef _library__utf8__hpp__included__
|
|
|
|
#define _library__utf8__hpp__included__
|
2012-06-20 17:40:27 +03:00
|
|
|
|
2013-03-25 00:29:48 +02:00
|
|
|
#include <iostream>
|
2012-06-20 17:40:27 +03:00
|
|
|
#include <cstdint>
|
|
|
|
#include <cstdlib>
|
|
|
|
#include <string>
|
2013-06-15 22:26:11 +03:00
|
|
|
#include <functional>
|
2012-06-20 17:40:27 +03:00
|
|
|
|
2013-12-20 12:39:24 +02:00
|
|
|
namespace utf8
{
/**
 * Initial state for UTF-8 parser.
 *
 * A parser state variable is initialized from this value before the first
 * call to parse_byte() (see to32i() for an example).
 */
extern const uint16_t initial_state;

/**
 * Parse a byte.
 *
 * Parameter ch: The character to parse. -1 for end of string.
 * Parameter state: The state. Mutated.
 * Returns: The codepoint, or -1 if no codepoint emitted.
 *
 * Note: When called with EOF, max 1 codepoint can be emitted.
 * Note: Declared non-throwing.
 */
int32_t parse_byte(int ch, uint16_t& state) noexcept;

/**
 * Return length of string in UTF-8 codepoints.
 *
 * Parameter str: The string.
 * Returns: The length in codepoints.
 *
 * Note: This is utf8::strlen taking a std::string, not the C library strlen().
 * Note: Declared non-throwing.
 */
size_t strlen(const std::string& str) noexcept;

/**
 * Transform UTF-8 into UTF-32.
 *
 * Parameter utf8: The UTF-8 encoded string.
 * Returns: The string as UTF-32.
 */
std::u32string to32(const std::string& utf8);

/**
 * Transform UTF-32 into UTF-8.
 *
 * Parameter utf32: The UTF-32 encoded string.
 * Returns: The string as UTF-8.
 */
std::string to8(const std::u32string& utf32);

/**
 * Iterator copy from UTF-8 to UTF-32
 *
 * Feeds every element of [begin, end) to parse_byte() as an unsigned byte and
 * writes each emitted codepoint through the output iterator. After the last
 * element, end of string is signalled to the parser so that a codepoint still
 * pending in the parser state can be emitted.
 *
 * Parameter begin: Iterator to the first UTF-8 byte.
 * Parameter end: Iterator one past the last UTF-8 byte.
 * Parameter target: Output iterator receiving the codepoints. It is taken by
 *	value, so the caller's iterator is not advanced.
 */
template<typename srcitr, typename dstitr>
inline void to32i(srcitr begin, srcitr end, dstitr target)
{
	uint16_t state = initial_state;
	// Store one parser result; negative values mean "no codepoint emitted".
	auto emit = [&target](int32_t codepoint) {
		if(codepoint >= 0) {
			*target = codepoint;
			++target;
		}
	};
	// Convert through unsigned char so bytes >= 0x80 are never passed as
	// negative values (-1 is reserved for end of string).
	for(srcitr i = begin; i != end; ++i)
		emit(parse_byte(static_cast<unsigned char>(*i), state));
	// Signal end of string; this can emit at most one more codepoint.
	emit(parse_byte(-1, state));
}
}
|
2012-06-20 17:40:27 +03:00
|
|
|
#endif
|