taskwarrior/src/utf8.cpp
Paul Beckingham 57106c86a9 UTF8
- Merged libexpr changes.
2014-01-02 00:26:23 -05:00

300 lines
8.3 KiB
C++

////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2013 - 2014, Paul Beckingham, Federico Hernandez.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// http://www.opensource.org/licenses/mit-license.php
//
////////////////////////////////////////////////////////////////////////////////
#include <cmake.h>
#include <string>
#include <text.h>
#include <utf8.h>
#include <i18n.h>
////////////////////////////////////////////////////////////////////////////////
// Converts one hexadecimal digit character to its numeric value:
//   '0'     -> 0
//   '9'     -> 9
//   'a'/'A' -> 10
//   'f'/'F' -> 15
// Any character that is not a hexadecimal digit converts to 0.
//
// The conversion lives in a function so that the argument is evaluated exactly
// once, which makes XDIGIT safe to use with expressions that have side effects.
// XDIGIT itself stays a macro so that existing uses compile unchanged.
static inline int utf8_xdigit_value (int c)
{
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c + 10 - 'a';
  if (c >= 'A' && c <= 'F') return c + 10 - 'A';

  return 0;
}

#define XDIGIT(x) (utf8_xdigit_value (x))
////////////////////////////////////////////////////////////////////////////////
// Parses a codepoint written as exactly four hexadecimal digits, in one of
// these forms:
//   xxxx
//   \uxxxx
//   U+xxxx
//
// The prefixed forms are only recognized when the input holds at least six
// characters; otherwise the first four characters are taken as the digits.
// Anything following the four digits is ignored, and a character that is not
// a hexadecimal digit contributes zero (see XDIGIT).
//
// Throws std::string (STRING_UTF8_INVALID_CP_REP) when the input is shorter
// than four characters.
unsigned int utf8_codepoint (const std::string& input)
{
  const std::string::size_type size = input.length ();

  const bool prefixed = size >= 6 &&
                        ((input[0] == 'U'  && input[1] == '+') ||
                         (input[0] == '\\' && input[1] == 'u'));

  // Offset of the most significant digit.
  std::string::size_type first;
  if (prefixed)
    first = 2;
  else if (size >= 4)
    first = 0;
  else
    throw std::string (STRING_UTF8_INVALID_CP_REP);

  unsigned int value = XDIGIT (input[first])     << 12 |
                       XDIGIT (input[first + 1]) <<  8 |
                       XDIGIT (input[first + 2]) <<  4 |
                       XDIGIT (input[first + 3]);
  return value;
}
////////////////////////////////////////////////////////////////////////////////
// Iterates along a UTF8 string, decoding one character per call.
//   - input:  the UTF8 encoded text
//   - i:      byte offset of the character to decode; on return it has been
//             advanced past that character
//   - returns the decoded codepoint, or 0 when i is at (or beyond) the end of
//             the string or points at a NUL byte, in which case i is left
//             unchanged
//
// Malformed input never causes a read outside the string:
//   - a lead byte whose sequence would run past the end of the string is
//     consumed as a single byte
//   - a byte that is not a valid lead byte is consumed as a single byte
// In both cases the raw byte value (0x80 - 0xFF) is returned.
// Continuation bytes are not validated beyond that.
unsigned int utf8_next_char (const std::string& input, std::string::size_type& i)
{
  // Nothing left to read.  The bounds test also protects against an offset
  // that is already beyond the end of the string.
  if (i >= input.length () || input[i] == '\0')
    return 0;

  // Read the byte as unsigned so that values above 0x7F are not sign-extended
  // on platforms where plain char is signed.
  const unsigned int lead = static_cast <unsigned char> (input[i]);

  // How many bytes in the sequence?
  int length = utf8_sequence (lead);

  // A sequence that is cut short by the end of the string is treated as a
  // single byte, so no byte beyond the string is ever touched.
  if (static_cast <std::string::size_type> (length) > input.length () - i)
    length = 1;

  const std::string::size_type start = i;
  i += length;

  // 110yyyyy 10xxxxxx -> 00000yyy yyxxxxxx
  if (length == 2)
    return ((lead             & 0x1F) << 6) +
            (input[start + 1] & 0x3F);

  // 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyy yyxxxxxx
  if (length == 3)
    return ((lead             & 0xF)  << 12) +
           ((input[start + 1] & 0x3F) << 6)  +
            (input[start + 2] & 0x3F);

  // 11110www 10zzzzzz 10yyyyyy 10xxxxxx -> 000wwwzz zzzzyyyy yyxxxxxx
  if (length == 4)
    return ((lead             & 0x7)  << 18) +
           ((input[start + 1] & 0x3F) << 12) +
           ((input[start + 2] & 0x3F) << 6)  +
            (input[start + 3] & 0x3F);

  // 0xxxxxxx -> 0xxxxxxx, and the fallback for anything malformed: the byte
  // stands for itself.
  return lead;
}
////////////////////////////////////////////////////////////////////////////////
// Encodes a codepoint as a UTF8 byte sequence.
// See http://en.wikipedia.org/wiki/UTF-8
//   - codepoint: the value to encode
//   - returns the encoding, one to four bytes long.  The result is built from
//             a NUL-terminated buffer, so codepoint 0 yields an empty string.
//   - throws std::string (STRING_UTF8_INVALID_CP) when codepoint is 0x110000
//             or greater
std::string utf8_character (unsigned int codepoint)
{
  if (codepoint >= 0x110000)
    throw std::string (STRING_UTF8_INVALID_CP);

  // Zero-filled, so whatever is written below is always NUL-terminated.
  char bytes[5] = {0, 0, 0, 0, 0};

  if (codepoint < 0x80)
  {
    // 0xxxxxxx -> 0xxxxxxx
    bytes[0] = codepoint;
  }
  else if (codepoint < 0x800)
  {
    // 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx
    bytes[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
    bytes[1] = 0x80 | ( codepoint       & 0x3F);
  }
  else if (codepoint < 0x10000)
  {
    // zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
    bytes[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
    bytes[1] = 0x80 | ((codepoint >> 6)  & 0x3F);
    bytes[2] = 0x80 | ( codepoint        & 0x3F);
  }
  else
  {
    // 000wwwzz zzzzyyyy yyxxxxxx -> 11110www 10zzzzzz 10yyyyyy 10xxxxxx
    bytes[0] = 0xF0 | ((codepoint >> 18) & 0x07);
    bytes[1] = 0x80 | ((codepoint >> 12) & 0x3F);
    bytes[2] = 0x80 | ((codepoint >> 6)  & 0x3F);
    bytes[3] = 0x80 | ( codepoint        & 0x3F);
  }

  return std::string (bytes);
}
////////////////////////////////////////////////////////////////////////////////
// Reports how many bytes make up the UTF8 sequence introduced by a lead byte.
//   - character: the lead byte; only bits 3 to 7 are examined
//   - returns 2, 3 or 4 for the corresponding lead byte patterns, and 1 for
//             everything else (ASCII, continuation bytes, invalid bytes)
int utf8_sequence (unsigned int character)
{
  // The top five bits of the byte are enough to tell the patterns apart.
  switch ((character >> 3) & 0x1F)
  {
  // 110xxxxx
  case 0x18:
  case 0x19:
  case 0x1A:
  case 0x1B:
    return 2;

  // 1110xxxx
  case 0x1C:
  case 0x1D:
    return 3;

  // 11110xxx
  case 0x1E:
    return 4;

  default:
    return 1;
  }
}
////////////////////////////////////////////////////////////////////////////////
// Length of a string in characters.
//   - str: UTF8 encoded text
//   - returns the number of bytes that are not continuation bytes (0b10??????),
//             so that each UTF8 sequence is counted once, by its first byte
unsigned int utf8_length (const std::string& str)
{
  unsigned int characters = 0;

  for (std::string::const_iterator byte = str.begin (); byte != str.end (); ++byte)
  {
    // The two highest bits are 10 only for a continuation byte.
    if ((*byte & 0xC0) != 0x80)
      ++characters;
  }

  return characters;
}
////////////////////////////////////////////////////////////////////////////////
// Width of a string in character cells.
//   - str: UTF8 encoded text
//   - returns the sum of the widths that mk_wcwidth reports for each character
//             decoded by utf8_next_char, stopping when utf8_next_char returns 0
//
// Only positive widths are accumulated.  mk_wcwidth is declared outside this
// file; this assumes it behaves like Markus Kuhn's reference wcwidth, which
// reports -1 for control characters (TODO confirm).  Adding such a value to
// the unsigned total would shrink or wrap the result.
unsigned int utf8_width (const std::string& str)
{
  unsigned int length = 0;
  std::string::size_type i = 0;
  unsigned int c;
  while ((c = utf8_next_char (str, i)))
  {
    const int cells = mk_wcwidth (c);
    if (cells > 0)
      length += cells;
  }

  return length;
}
////////////////////////////////////////////////////////////////////////////////
// Length of a string in characters, not counting escape sequences.
//   - str: UTF8 encoded text
//   - returns the number of characters outside of escape sequences
//
// An escape sequence starts at an ESC byte (033) and runs up to and including
// the next 'm'; none of its bytes are counted.  Outside of escape sequences
// every byte that is not a continuation byte (0b10??????) counts as one
// character.
unsigned int utf8_text_length (const std::string& str)
{
  const std::string::size_type size = str.length ();
  const char* bytes = str.data ();

  unsigned int characters = 0;
  bool escaped = false;

  for (std::string::size_type pos = 0; pos < size; ++pos)
  {
    const char byte = bytes[pos];

    if (escaped)
    {
      // Swallow the sequence; 'm' is its last byte.
      if (byte == 'm')
        escaped = false;

      continue;
    }

    if (byte == 033)
    {
      escaped = true;
      continue;
    }

    // The two highest bits are 10 only for a continuation byte.
    if ((byte & 0xC0) != 0x80)
      ++characters;
  }

  return characters;
}
////////////////////////////////////////////////////////////////////////////////
// Width of a string in character cells, not counting escape sequences.
//   - str: UTF8 encoded text
//   - returns the sum of the widths that mk_wcwidth reports for each character
//             outside of escape sequences, stopping when utf8_next_char
//             returns 0
//
// An escape sequence starts at an ESC character (033) and runs up to and
// including the next 'm'; it contributes nothing to the width.
//
// Only positive widths are accumulated.  mk_wcwidth is declared outside this
// file; this assumes it behaves like Markus Kuhn's reference wcwidth, which
// reports -1 for control characters (TODO confirm).  Adding such a value to
// the unsigned total would shrink or wrap the result.
unsigned int utf8_text_width (const std::string& str)
{
  bool in_color = false;

  unsigned int length = 0;
  std::string::size_type i = 0;
  unsigned int c;
  while ((c = utf8_next_char (str, i)))
  {
    if (in_color)
    {
      // 'm' is the last character of the escape sequence.
      if (c == 'm')
        in_color = false;
    }
    else if (c == 033)
    {
      in_color = true;
    }
    else
    {
      const int cells = mk_wcwidth (c);
      if (cells > 0)
        length += cells;
    }
  }

  return length;
}
////////////////////////////////////////////////////////////////////////////////
// Extracts part of a UTF8 string, with positions measured in characters
// rather than bytes.
//   - input:  UTF8 encoded text
//   - start:  number of characters to skip before the result begins
//   - length: number of characters to extract; 0 means everything from start
//             to the end of the input
//   - returns the selected bytes of input
//
// Characters are stepped over with utf8_next_char, which stops advancing once
// it returns 0, so a start or length that reaches past that point is
// effectively truncated there.
const std::string utf8_substr (
  const std::string& input,
  unsigned int start,
  unsigned int length /*=0*/)
{
  // Byte offset at which the result begins.
  std::string::size_type from = 0;
  for (unsigned int skipped = 0; skipped != start; ++skipped)
    utf8_next_char (input, from);

  // No length given: take the remainder of the input.
  if (length == 0)
    return input.substr (from);

  // Byte offset just past the last requested character.
  std::string::size_type to = from;
  for (unsigned int taken = 0; taken != length; ++taken)
    utf8_next_char (input, to);

  return input.substr (from, to - from);
}
////////////////////////////////////////////////////////////////////////////////