#include "stream.h"
#include <iostream>
#include "exp.h"

#ifndef YAML_PREFETCH_SIZE
#define YAML_PREFETCH_SIZE 2048
#endif

#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))

#define CP_REPLACEMENT_CHARACTER (0xFFFD)

namespace YAML
{
	// States of the small state machine that sniffs the first (up to four)
	// bytes of the input in order to pick a character set.
	enum UtfIntroState {
		uis_start,
		uis_utfbe_b1,
		uis_utf32be_b2,
		uis_utf32be_bom3,
		uis_utf32be,
		uis_utf16be,
		uis_utf16be_bom1,
		uis_utfle_bom1,
		uis_utf16le_bom2,
		uis_utf32le_bom3,
		uis_utf16le,
		uis_utf32le,
		uis_utf8_imp,
		uis_utf16le_imp,
		uis_utf32le_imp3,
		uis_utf8_bom1,
		uis_utf8_bom2,
		uis_utf8,
		uis_error
	};

	// Classification of an intro byte; used as the column index of the
	// transition and unget tables below.
	enum UtfIntroCharType {
		uict00,
		uictBB,
		uictBF,
		uictEF,
		uictFE,
		uictFF,
		uictAscii,
		uictOther,
		uictMax
	};

	// Indexed by UtfIntroState: true when the state machine must stop.
	static bool s_introFinalState[] = {
		false, //uis_start
		false, //uis_utfbe_b1
		false, //uis_utf32be_b2
		false, //uis_utf32be_bom3
		true,  //uis_utf32be
		true,  //uis_utf16be
		false, //uis_utf16be_bom1
		false, //uis_utfle_bom1
		false, //uis_utf16le_bom2
		false, //uis_utf32le_bom3
		true,  //uis_utf16le
		true,  //uis_utf32le
		false, //uis_utf8_imp
		false, //uis_utf16le_imp
		false, //uis_utf32le_imp3
		false, //uis_utf8_bom1
		false, //uis_utf8_bom2
		true,  //uis_utf8
		true,  //uis_error
	};

	// Next state, indexed by [current state][type of the byte just read].
	// There is no row for uis_error: it is final, so it is never used as a
	// row index.
	static UtfIntroState s_introTransitions[][uictMax] = {
		// uict00,           uictBB,        uictBF,      uictEF,        uictFE,           uictFF,         uictAscii,    uictOther
		{uis_utfbe_b1,     uis_utf8,      uis_utf8,    uis_utf8_bom1, uis_utf16be_bom1, uis_utfle_bom1, uis_utf8_imp, uis_utf8},    // uis_start
		{uis_utf32be_b2,   uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf16be,  uis_utf8},    // uis_utfbe_b1
		{uis_utf32be,      uis_utf8,      uis_utf8,    uis_utf8,      uis_utf32be_bom3, uis_utf8,       uis_utf8,     uis_utf8},    // uis_utf32be_b2
		{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf32be,    uis_utf8,     uis_utf8},    // uis_utf32be_bom3
		{uis_utf32be,      uis_utf32be,   uis_utf32be, uis_utf32be,   uis_utf32be,      uis_utf32be,    uis_utf32be,  uis_utf32be}, // uis_utf32be
		{uis_utf16be,      uis_utf16be,   uis_utf16be, uis_utf16be,   uis_utf16be,      uis_utf16be,    uis_utf16be,  uis_utf16be}, // uis_utf16be
		{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf16be,    uis_utf8,     uis_utf8},    // uis_utf16be_bom1
		{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf16le_bom2, uis_utf8,       uis_utf8,     uis_utf8},    // uis_utfle_bom1
		{uis_utf32le_bom3, uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le}, // uis_utf16le_bom2
		{uis_utf32le,      uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le}, // uis_utf32le_bom3
		{uis_utf16le,      uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le}, // uis_utf16le
		{uis_utf32le,      uis_utf32le,   uis_utf32le, uis_utf32le,   uis_utf32le,      uis_utf32le,    uis_utf32le,  uis_utf32le}, // uis_utf32le
		{uis_utf16le_imp,  uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},    // uis_utf8_imp
		{uis_utf32le_imp3, uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le}, // uis_utf16le_imp
		{uis_utf32le,      uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le}, // uis_utf32le_imp3
		{uis_utf8,         uis_utf8_bom2, uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},    // uis_utf8_bom1
		{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},    // uis_utf8_bom2
		{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},    // uis_utf8
	};

	// Number of already-read intro bytes to hand back to the stream, indexed
	// by [current state][type of the byte just read].
	static char s_introUngetCount[][uictMax] = {
		// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
		{0, 1, 1, 0, 0, 0, 0, 1}, // uis_start
		{0, 2, 2, 2, 2, 2, 2, 2}, // uis_utfbe_b1
		{3, 3, 3, 3, 0, 3, 3, 3}, // uis_utf32be_b2
		{4, 4, 4, 4, 4, 0, 4, 4}, // uis_utf32be_bom3
		{1, 1, 1, 1, 1, 1, 1, 1}, // uis_utf32be
		{1, 1, 1, 1, 1, 1, 1, 1}, // uis_utf16be
		{2, 2, 2, 2, 2, 0, 2, 2}, // uis_utf16be_bom1
		{2, 2, 2, 2, 0, 2, 2, 2}, // uis_utfle_bom1
		{0, 1, 1, 1, 1, 1, 1, 1}, // uis_utf16le_bom2
		{0, 2, 2, 2, 2, 2, 2, 2}, // uis_utf32le_bom3
		{1, 1, 1, 1, 1, 1, 1, 1}, // uis_utf16le
		{1, 1, 1, 1, 1, 1, 1, 1}, // uis_utf32le
		{0, 2, 2, 2, 2, 2, 2, 2}, // uis_utf8_imp
		{0, 3, 3, 3, 3, 3, 3, 3}, // uis_utf16le_imp
		{4, 4, 4, 4, 4, 4, 4, 4}, // uis_utf32le_imp3
		{2, 0, 2, 2, 2, 2, 2, 2}, // uis_utf8_bom1
		{3, 3, 0, 3, 3, 3, 3, 3}, // uis_utf8_bom2
		{1, 1, 1, 1, 1, 1, 1, 1}, // uis_utf8
	};

	// IntroCharTypeOf
	// . Maps a value returned by istream::get() to its table column.
	//   EOF and anything outside 0..0xFF is classified as uictOther.
	inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
	{
		if (std::istream::traits_type::eof() == ch) {
			return uictOther;
		}

		switch (ch) {
		case 0: return uict00;
		case 0xBB: return uictBB;
		case 0xBF: return uictBF;
		case 0xEF: return uictEF;
		case 0xFE: return uictFE;
		case 0xFF: return uictFF;
		}

		// Every remaining byte value in 1..0xFE lands in this bucket
		if ((ch > 0) && (ch < 0xFF)) {
			return uictAscii;
		}

		return uictOther;
	}

	// Utf8Adjust
	// . Builds a single UTF-8 byte: the top 'lead_bits' bits are set to one,
	//   followed by a zero bit, followed by the payload taken from 'ch'
	//   after shifting it right by 'rshift'.
	inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
	{
		const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
		const unsigned char mask = (0xFF >> (lead_bits + 1));
		return static_cast<char>(static_cast<unsigned char>(
			header | ((ch >> rshift) & mask)
			));
	}

	// QueueUnicodeCodepoint
	// . Appends the UTF-8 encoding of 'ch' to 'q'.
	inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
	{
		// We are not allowed to queue the Stream::eof() codepoint, so
		// replace it with CP_REPLACEMENT_CHARACTER
		if (static_cast<unsigned long>(Stream::eof()) == ch) {
			ch = CP_REPLACEMENT_CHARACTER;
		}

		// Surrogates and values beyond U+10FFFF are not Unicode scalar values
		// and have no well-formed UTF-8 encoding (they can reach us from
		// UTF-32 input), so substitute the replacement character.
		if ((ch >= 0xD800 && ch < 0xE000) || ch > 0x10FFFF) {
			ch = CP_REPLACEMENT_CHARACTER;
		}

		if (ch < 0x80) {
			q.push_back(Utf8Adjust(ch, 0, 0));
		} else if (ch < 0x800) {
			q.push_back(Utf8Adjust(ch, 2, 6));
			q.push_back(Utf8Adjust(ch, 1, 0));
		} else if (ch < 0x10000) {
			q.push_back(Utf8Adjust(ch, 3, 12));
			q.push_back(Utf8Adjust(ch, 1, 6));
			q.push_back(Utf8Adjust(ch, 1, 0));
		} else {
			q.push_back(Utf8Adjust(ch, 4, 18));
			q.push_back(Utf8Adjust(ch, 1, 12));
			q.push_back(Utf8Adjust(ch, 1, 6));
			q.push_back(Utf8Adjust(ch, 1, 0));
		}
	}

	// Stream
	// . Wraps 'input', sniffs its leading bytes to choose a character set,
	//   and primes the read-ahead queue with the first character.
	Stream::Stream(std::istream& input)
		: m_input(input), m_nPushedBack(0),
		m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
		m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
	{
		typedef std::istream::traits_type char_traits;

		if(!input)
			return;

		// Determine (or guess) the character-set by reading the BOM, if any. See
		// the YAML specification for the determination algorithm.
		char_traits::int_type intro[4];
		int nIntroUsed = 0;
		UtfIntroState state = uis_start;
		for (; !s_introFinalState[state]; ) {
			std::istream::int_type ch = input.get();
			intro[nIntroUsed++] = ch;
			UtfIntroCharType charType = IntroCharTypeOf(ch);
			UtfIntroState newState = s_introTransitions[state][charType];
			int nUngets = s_introUngetCount[state][charType];
			if (nUngets > 0) {
				// Hitting EOF while sniffing leaves eofbit/failbit set on the
				// stream. The StreamIn* readers only queue data while the
				// stream is good(), so without clearing the state the bytes
				// pushed back here would never be delivered and short inputs
				// would read as empty.
				input.clear();
				// Bytes are pushed last-read first; GetNextByte() pops from
				// the end, so they come back out in their original order.
				for (; nUngets > 0; --nUngets) {
					if (char_traits::eof() != intro[--nIntroUsed]) {
						m_bufPushback[m_nPushedBack++] =
							char_traits::to_char_type(intro[nIntroUsed]);
					}
				}
			}
			state = newState;
		}

		switch (state) {
		case uis_utf8: m_charSet = utf8; break;
		case uis_utf16le: m_charSet = utf16le; break;
		case uis_utf16be: m_charSet = utf16be; break;
		case uis_utf32le: m_charSet = utf32le; break;
		case uis_utf32be: m_charSet = utf32be; break;
		default: m_charSet = utf8; break;
		}

		ReadAheadTo(0);
	}

	Stream::~Stream()
	{
		delete[] m_pPrefetched;
	}

	// peek
	// . Returns the next character without consuming it, or Stream::eof()
	//   when nothing is queued.
	char Stream::peek() const
	{
		if (m_readahead.empty()) {
			return Stream::eof();
		}

		return m_readahead[0];
	}

	// . True while the input is good or a non-eof character is still queued
	Stream::operator bool() const
	{
		return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
	}

	// get
	// . Extracts a character from the stream and updates our position
	char Stream::get()
	{
		char ch = peek();
		AdvanceCurrent();
		m_mark.column++;

		if(ch == '\n') {
			m_mark.column = 0;
			m_mark.line++;
		}

		return ch;
	}

	// get
	// . Extracts 'n' characters from the stream and updates our position
	std::string Stream::get(int n)
	{
		std::string ret;
		ret.reserve(n);
		for(int i=0;i<n;i++)
			ret += get();
		return ret;
	}

	// NOTE(review): the text between "for(int i=0;i" in get(int) and the
	// final "i; }" of the function preceding StreamInUtf8() was not available
	// when this file was reviewed. The remainder of get(int) above and the
	// three functions below (eat, AdvanceCurrent, _ReadAheadTo) are a
	// reconstruction; confirm them against stream.h and the original file.

	// eat
	// . Eats 'n' characters and updates our position.
	void Stream::eat(int n)
	{
		for(int i=0;i<n;i++)
			get();
	}

	// AdvanceCurrent
	// . Drops the current character and makes sure the next one is queued
	void Stream::AdvanceCurrent()
	{
		if (!m_readahead.empty()) {
			m_readahead.pop_front();
			m_mark.pos++;
		}

		ReadAheadTo(0);
	}

	// _ReadAheadTo
	// . Decodes input until index 'i' of the read-ahead queue is populated
	//   or the input is exhausted; returns whether index 'i' is available.
	bool Stream::_ReadAheadTo(size_t i) const
	{
		while (m_input.good() && (m_readahead.size() <= i)) {
			switch (m_charSet) {
			case utf8: StreamInUtf8(); break;
			case utf16le: StreamInUtf16(); break;
			case utf16be: StreamInUtf16(); break;
			case utf32le: StreamInUtf32(); break;
			case utf32be: StreamInUtf32(); break;
			}
		}

		// signal end of stream
		if(!m_input.good())
			m_readahead.push_back(Stream::eof());

		return m_readahead.size() > i;
	}

	// StreamInUtf8
	// . Input is already UTF-8: pass one byte straight through
	void Stream::StreamInUtf8() const
	{
		unsigned char b = GetNextByte();
		if (m_input.good()) {
			m_readahead.push_back(b);
		}
	}

	// StreamInUtf16
	// . Reads one code point (one unit, or a surrogate pair) of UTF-16 in
	//   the byte order given by m_charSet and queues it as UTF-8.
	//   Unpaired surrogates are queued as CP_REPLACEMENT_CHARACTER.
	void Stream::StreamInUtf16() const
	{
		unsigned long ch = 0;
		unsigned char bytes[2];
		// Index of the most significant byte within 'bytes'
		int nBigEnd = (m_charSet == utf16be) ? 0 : 1;

		bytes[0] = GetNextByte();
		bytes[1] = GetNextByte();
		if (!m_input.good()) {
			return;
		}
		ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
			static_cast<unsigned long>(bytes[1 ^ nBigEnd]);

		if (ch >= 0xDC00 && ch < 0xE000) {
			// Trailing (low) surrogate...ugh, wrong order
			QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
			return;
		} else if (ch >= 0xD800 && ch < 0xDC00) {
			// ch is a leading (high) surrogate

			// Four byte UTF-8 code point

			// Read the trailing (low) surrogate
			for (;;) {
				bytes[0] = GetNextByte();
				bytes[1] = GetNextByte();
				if (!m_input.good()) {
					// Input ended after an unpaired high surrogate
					QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
					return;
				}
				unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
					static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
				if (chLow < 0xDC00 || chLow >= 0xE000) {
					// Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER
					// into the stream in place of the unpaired high surrogate.
					QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);

					// Deal with the next UTF-16 unit
					if (chLow < 0xD800 || chLow >= 0xE000) {
						// Easiest case: the unit we just read is an ordinary
						// character, so queue it and return
						QueueUnicodeCodepoint(m_readahead, chLow);
						return;
					} else {
						// Start the loop over with the new high surrogate
						ch = chLow;
						continue;
					}
				}

				// Select the payload bits from the high surrogate
				ch &= 0x3FF;
				ch <<= 10;

				// Include bits from low surrogate
				ch |= (chLow & 0x3FF);

				// Add the surrogacy offset
				ch += 0x10000;

				// The pair is complete; stop consuming input
				break;
			}
		}

		QueueUnicodeCodepoint(m_readahead, ch);
	}

	// . Views the prefetch buffer as the char* that streambuf::sgetn expects
	inline char* ReadBuffer(unsigned char* pBuffer)
	{
		return reinterpret_cast<char*>(pBuffer);
	}

	// GetNextByte
	// . Returns the next raw input byte: pushed-back intro bytes first, then
	//   the prefetch buffer, which is refilled from the stream buffer in
	//   blocks of YAML_PREFETCH_SIZE. When no more data is available, sets
	//   eofbit on the input and returns 0.
	unsigned char Stream::GetNextByte() const
	{
		if (m_nPushedBack) {
			return m_bufPushback[--m_nPushedBack];
		}

		if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
			std::streambuf *pBuf = m_input.rdbuf();
			m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched),
				YAML_PREFETCH_SIZE);
			m_nPrefetchedUsed = 0;
			if (!m_nPrefetchedAvailable) {
				// Callers detect the end of input through m_input.good()
				m_input.setstate(std::ios_base::eofbit);
				return 0;
			}
		}

		return m_pPrefetched[m_nPrefetchedUsed++];
	}

	// StreamInUtf32
	// . Reads one four-byte UTF-32 unit in the byte order given by m_charSet
	//   and queues it as UTF-8.
	void Stream::StreamInUtf32() const
	{
		// Order in which to consume 'bytes', most significant byte first:
		// row 0 for little-endian input, row 1 for big-endian input
		static const int indexes[2][4] = {
			{3, 2, 1, 0},
			{0, 1, 2, 3}
		};

		unsigned long ch = 0;
		unsigned char bytes[4];
		const int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];

		bytes[0] = GetNextByte();
		bytes[1] = GetNextByte();
		bytes[2] = GetNextByte();
		bytes[3] = GetNextByte();
		if (!m_input.good()) {
			return;
		}

		for (int i = 0; i < 4; ++i) {
			ch <<= 8;
			ch |= bytes[pIndexes[i]];
		}

		QueueUnicodeCodepoint(m_readahead, ch);
	}
}