You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

249 lines
6.8 KiB

/* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
file Copyright.txt or https://cmake.org/licensing for details. */
#include "cm_codecvt.hxx"
#if defined(_WIN32)
# include <cassert>
# include <cstring>
# include <windows.h>
# undef max
# include "cmsys/Encoding.hxx"
# include "cm_utf8.h"
#endif
/* Construct a conversion facet for the requested target encoding.

   On Windows, ConsoleOutput selects the code page reported by
   GetConsoleOutputCP() and ANSI selects CP_ACP; both enable conversion.
   Every other encoding value, and every encoding on non-Windows platforms,
   disables conversion entirely.  */
codecvt::codecvt(Encoding e)
#if defined(_WIN32)
  : m_codepage(0)
#endif
{
#if defined(_WIN32)
  if (e == codecvt::ConsoleOutput) {
    // Convert to whatever code page the console currently uses for output.
    this->m_noconv = false;
    this->m_codepage = GetConsoleOutputCP();
    return;
  }
  if (e == codecvt::ANSI) {
    // Convert to the system ANSI code page.
    this->m_noconv = false;
    this->m_codepage = CP_ACP;
    return;
  }
#else
  // We don't know which ANSI encoding to use for other platforms than
  // Windows so we don't do any conversion there.
  static_cast<void>(e);
#endif
  // UTF8 / UTF8_WITH_BOM: assume the internal encoding is already UTF-8.
  // None (and any unrecognized value): no encoding.
  this->m_noconv = true;
}
// Out-of-line defaulted destructor; there is nothing to release here.
codecvt::~codecvt() = default;
bool codecvt::do_always_noconv() const noexcept
{
return this->m_noconv;
}
/* Convert the bytes in [from, from_end) and write the result to
   [to, to_end).

   from_next and to_next are first reset to the start of their ranges.  When
   conversion is disabled the function returns noconv immediately.  On
   Windows the input is walked one byte at a time: bytes that do not finish a
   UTF-8 codepoint are stashed in `state` via BufferPartial, and the byte that
   finishes a codepoint triggers Decode.  Returns error on a malformed byte
   sequence, whatever non-ok result Decode reports, or ok once all input has
   been consumed.  On other platforms the function always returns noconv.  */
std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
                                          const char* from_end,
                                          const char*& from_next, char* to,
                                          char* to_end, char*& to_next) const
{
  from_next = from;
  to_next = to;
  if (this->m_noconv) {
    return std::codecvt_base::noconv;
  }
#if defined(_WIN32)
  // Read-only alias of the conversion state.  It must not be modified here
  // until a byte has been fully processed and consumed (with enough room in
  // the output buffer); the helpers re-cast the state and update it.
  State const& pending = reinterpret_cast<State&>(state);

  while (from_next != from_end) {
    // Number of leading one bits in the next input byte.
    unsigned char const lead_bits =
      cm_utf8_ones[static_cast<unsigned char>(*from_next)];

    // A continuation byte (10xx xxxx) is valid exactly when a partial
    // codepoint is buffered.  Any mismatch is an error: either a buffered
    // codepoint was never completed, or a continuation arrived for a
    // codepoint that never started.
    bool const is_continuation = (lead_bits == 1);
    bool const has_partial = (pending.buffered != 0);
    if (is_continuation != has_partial) {
      return std::codecvt_base::error;
    }

    // Total number of bytes in the codepoint this byte belongs to.
    int total = 0;
    if (is_continuation) {
      // 10xx xxxx: the size was recorded when the codepoint started.
      assert(pending.size != 0);
      total = pending.size;
    } else if (lead_bits == 0) {
      // 0xxx xxxx: a single-byte codepoint.
      total = 1;
    } else if (lead_bits <= 4) {
      // 110x xxxx, 1110 xxxx, 1111 0xxx: sizes 2, 3 and 4 respectively.
      total = lead_bits;
    } else {
      // Invalid byte.
      return std::codecvt_base::error;
    }
    assert(total > 0);

    if (pending.buffered + 1 != total) {
      // Not the final byte of the codepoint yet: keep it for later.
      this->BufferPartial(state, total, from_next);
      continue;
    }

    // This byte completes the codepoint: convert and emit it.
    std::codecvt_base::result const rc =
      this->Decode(state, total, from_next, to_next, to_end);
    if (rc != std::codecvt_base::ok) {
      return rc;
    }
  }

  return std::codecvt_base::ok;
#else
  static_cast<void>(state);
  static_cast<void>(from);
  static_cast<void>(from_end);
  static_cast<void>(from_next);
  static_cast<void>(to);
  static_cast<void>(to_end);
  static_cast<void>(to_next);
  return std::codecvt_base::noconv;
#endif
}
/* Flush any conversion state left over at the end of output.

   to_next is reset to `to`.  Returns noconv when conversion is disabled.
   On Windows, if bytes of an unfinished codepoint are still buffered in
   `state`, the result of DecodePartial is returned; otherwise ok.  On other
   platforms there is never anything to flush and ok is returned.  */
std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
                                              char* to_end,
                                              char*& to_next) const
{
  to_next = to;
  if (this->m_noconv) {
    return std::codecvt_base::noconv;
  }
#if defined(_WIN32)
  State const& pending = reinterpret_cast<State&>(state);
  if (pending.buffered == 0) {
    // Nothing is buffered, so there is nothing to write.
    return std::codecvt_base::ok;
  }
  return this->DecodePartial(state, to_next, to_end);
#else
  static_cast<void>(state);
  static_cast<void>(to_end);
  return std::codecvt_base::ok;
#endif
}
#if defined(_WIN32)
/* Finish the current codepoint with the byte at *from_next, convert it from
   UTF-8 to the target code page, and append the result at to_next.

   state      conversion state holding the previously buffered bytes
   size       total number of bytes in the codepoint (buffered bytes + 1)
   from_next  points at the completing input byte; advanced on success
   to_next    output cursor; advanced by the number of bytes written
   to_end     end of the output buffer

   Returns ok on success (the state is reset for the next codepoint),
   partial when the output buffer has no room for the converted codepoint,
   and error when either conversion step fails.  On partial or error neither
   the cursors nor the state are modified.  */
std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
                                          const char*& from_next,
                                          char*& to_next, char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Collect all the bytes for this codepoint.
  char buf[4];
  memcpy(buf, lstate.partial, lstate.buffered);
  buf[lstate.buffered] = *from_next;

  // Convert the encoding: UTF-8 -> UTF-16 first.
  wchar_t wbuf[2];
  int wlen =
    MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
  if (wlen <= 0) {
    return std::codecvt_base::error;
  }

  // WideCharToMultiByte treats an output size of 0 as a request for the
  // required buffer size: it returns a positive count and writes nothing.
  // Without this check a full output buffer would be reported as a successful
  // conversion and to_next would be advanced past to_end.
  if (to_end - to_next <= 0) {
    return std::codecvt_base::partial;
  }

  // UTF-16 -> target code page, written directly into the output buffer.
  int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
                                 static_cast<int>(to_end - to_next), nullptr,
                                 nullptr);
  if (tlen <= 0) {
    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
      return std::codecvt_base::partial;
    }
    return std::codecvt_base::error;
  }

  // Move past the now-consumed byte in the input buffer.
  ++from_next;

  // Move past the converted codepoint in the output buffer.
  to_next += tlen;

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}
/* Try to convert the bytes of an unfinished codepoint that are still
   buffered in `state` and append the result at to_next.  do_unshift calls
   this when buffered bytes remain.

   state    conversion state holding the buffered bytes
   to_next  output cursor; advanced by the number of bytes written
   to_end   end of the output buffer

   Returns ok on success (the state is reset), partial when the output
   buffer has no room, and error when either conversion step fails.  On
   partial or error neither to_next nor the state are modified.  */
std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
                                                 char*& to_next,
                                                 char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Try converting the partial codepoint: UTF-8 -> UTF-16 first.
  wchar_t wbuf[2];
  int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
                                 lstate.buffered, wbuf, 2);
  if (wlen <= 0) {
    return std::codecvt_base::error;
  }

  // WideCharToMultiByte treats an output size of 0 as a request for the
  // required buffer size: it returns a positive count and writes nothing.
  // Without this check a full output buffer would be reported as a successful
  // conversion and to_next would be advanced past to_end.
  if (to_end - to_next <= 0) {
    return std::codecvt_base::partial;
  }

  // UTF-16 -> target code page, written directly into the output buffer.
  int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
                                 static_cast<int>(to_end - to_next), nullptr,
                                 nullptr);
  if (tlen <= 0) {
    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
      return std::codecvt_base::partial;
    }
    return std::codecvt_base::error;
  }

  // Move past the converted codepoint in the output buffer.
  to_next += tlen;

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}
void codecvt::BufferPartial(mbstate_t& state, int size,
const char*& from_next) const
{
State& lstate = reinterpret_cast<State&>(state);
// Save the byte in our buffer for later.
lstate.partial[lstate.buffered++] = *from_next;
lstate.size = size;
// Move past the now-consumed byte in the input buffer.
++from_next;
}
#endif
/* Report the maximum length used by the stream machinery for buffer sizing.
   The value is 4, matching the largest codepoint size that do_out accepts.  */
int codecvt::do_max_length() const noexcept
{
  constexpr int max_bytes_per_codepoint = 4;
  return max_bytes_per_codepoint;
}
/* Describe the external encoding's width to the stream machinery.
   Returns 0, which std::codecvt defines as a variable-width encoding.  */
int codecvt::do_encoding() const noexcept
{
  constexpr int variable_width = 0;
  return variable_width;
}