php-sqlsrv/source/shared/globalization.h

//---------------------------------------------------------------------------------------------------------------------------------
// File: Globalization.h
//
// Contents: Contains functions for handling Windows format strings
//			 and UTF-16 on non-Windows platforms
//
// Microsoft Drivers 5.3 for PHP for SQL Server
// Copyright(c) Microsoft Corporation
// All rights reserved.
// MIT License
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files(the ""Software""),
//  to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
//  and / or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions :
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
//  IN THE SOFTWARE.
//---------------------------------------------------------------------------------------------------------------------------------

#if !defined(_GLOBALIZATION_H_)
#define _GLOBALIZATION_H_

#include "xplat.h"
#include "typedefs_for_linux.h"
#include <errno.h>

#include <iconv.h>

const iconv_t INVALID_ICONV = (iconv_t)(-1);

class IConvCache : public SLIST_ENTRY
{
    iconv_t m_iconv;

    // Prevent copying
    IConvCache( const IConvCache & );
    IConvCache & operator=( const IConvCache & );

public:
    IConvCache( int dstIdx, int srcIdx );
    ~IConvCache();

    iconv_t GetIConv() const
    {
        return m_iconv;
    }
};


class EncodingConverter
{
    UINT m_dstCodePage;
    UINT m_srcCodePage;
    const IConvCache * m_pCvtCache;

    bool IsValidIConv() const
    {
        return (NULL != m_pCvtCache && INVALID_ICONV != m_pCvtCache->GetIConv());
    }

    template< typename T >
    struct iconv_buffer
    {
        char * m_pBytes;
        size_t m_nBytesLeft;

        iconv_buffer( char * buffer, size_t cchSize )
            : m_pBytes(buffer), m_nBytesLeft(sizeof(T)*cchSize) {}
        ~iconv_buffer() {}

        void Reset( char * buffer, size_t cchSize )
        {
            m_pBytes = buffer;
            m_nBytesLeft = cchSize*sizeof(T);
        }

        void SkipSingleCh()
        {
            assert( sizeof(T) <= m_nBytesLeft );
            m_nBytesLeft -= sizeof(T);
            m_pBytes += sizeof(T);
        }
        void SkipDoubleCh()
        {
            SkipSingleCh();
            // Only skip second half if there's bytes left and it is non-NULL
            if ( m_nBytesLeft &&  0 != *(UNALIGNED T *)m_pBytes )
                SkipSingleCh();
        }
        void SkipUtf8Ch()
        {
            assert( 1 == sizeof(T) );
            const char * pNext = SystemLocale::NextChar( CP_UTF8, m_pBytes, m_nBytesLeft );
            assert( m_pBytes < pNext && (size_t)(pNext-m_pBytes) <= SystemLocale::MaxCharCchSize(CP_UTF8) );

            UINT toTrim = (UINT)(pNext - m_pBytes);
            assert( toTrim <= m_nBytesLeft );
            assert( 0 < toTrim );

            m_nBytesLeft -= toTrim;
            m_pBytes += toTrim;
        }

        static char DefaultChar( UINT srcDataCP )
        {
            return 0x3f;
        }
        static WCHAR DefaultWChar( UINT srcDataCP )
        {
            return (CP_UTF8 == srcDataCP ? 0xfffd   // Unicode to Unicode, use Unicode default char
                : (932 == srcDataCP ? 0x30fb        // 932 to Unicode has special default char
                    : 0x003f));                     // WCP source, use '?'
        }
        void AssignDefault( UINT srcDataCP )
        {
            assert( sizeof(T) <= m_nBytesLeft );
            if ( 1 == sizeof(T) )
            {
                *m_pBytes = DefaultChar( srcDataCP );
                --m_nBytesLeft;
                ++m_pBytes;
            }
            else
            {
                *(UNALIGNED T *)m_pBytes = DefaultWChar( srcDataCP );
                m_nBytesLeft -= sizeof(T);
                m_pBytes += sizeof(T);
            }
        }
        bool AssignDefaultUtf8( UINT srcDataCP )
        {
            // This is a utf8 buffer so T must be char
            assert( 1 == sizeof(T) );
            if ( CP_UTF16 == srcDataCP )
            {
                // If source codepage is UTF16 then use Unicode default char
                // UTF8 default char is 3 bytes long
                if ( m_nBytesLeft < 3 )
                    return false;

                *m_pBytes++ = (T)0xef;
                *m_pBytes++ = (T)0xbf;
                *m_pBytes++ = (T)0xbd;
                m_nBytesLeft -= 3;
            }
            else if ( 932 == srcDataCP )
            {
                // If source codepage is 932 then use special default char
                // UTF8 default char for 932 is 3 bytes long
                if ( m_nBytesLeft < 3 )
                    return false;

                *m_pBytes++ = (T)0xe3;
                *m_pBytes++ = (T)0x83;
                *m_pBytes++ = (T)0xbb;
                m_nBytesLeft -= 3;
            }
            else
            {
                *m_pBytes = DefaultChar( srcDataCP );
                ++m_pBytes;
                --m_nBytesLeft;
            }
            return true;
        }

        // Prevent compiler from generating these
        iconv_buffer();
        iconv_buffer( const iconv_buffer & other );
        iconv_buffer & operator=( const iconv_buffer & other );
    };

    template< class DestType >
    bool AddDefault( iconv_buffer<DestType> * dest, bool * pHasLoss, DWORD * pErrorCode ) const
    {
        if ( NULL != pHasLoss )
            *pHasLoss = true;

        if ( CP_UTF8 != m_dstCodePage )
            dest->AssignDefault( m_srcCodePage );
        else if ( !dest->AssignDefaultUtf8(m_srcCodePage) )
        {
            // Not enough room for the default char
            if ( NULL != pErrorCode )
                *pErrorCode = ERROR_INSUFFICIENT_BUFFER;
            return false;
        }
        return true;
    }

    template< class DestType, class SrcType >
    size_t Convert(
        iconv_buffer<DestType> & dest,
        iconv_buffer<SrcType> & src,
        bool failIfLossy = false, bool * pHasLoss = NULL, DWORD * pErrorCode = NULL ) const
    {
        if ( !IsValidIConv() )
            return 0;

        size_t iconv_ret;
        size_t cchDest = dest.m_nBytesLeft/sizeof(DestType);

        if ( NULL != pHasLoss )
            *pHasLoss = false;
        if ( NULL != pErrorCode )
            *pErrorCode = ERROR_SUCCESS;

        while ( 0 < dest.m_nBytesLeft && 0 < src.m_nBytesLeft )
        {
            // First clear any intermediate state left over from previous conversions
            iconv_ret = iconv( m_pCvtCache->GetIConv(), NULL, NULL, NULL, NULL );
            assert( 0 == iconv_ret );

            // Now attempt conversion
            iconv_ret = iconv( m_pCvtCache->GetIConv(), &src.m_pBytes, &src.m_nBytesLeft, &dest.m_pBytes, &dest.m_nBytesLeft );
            if ( iconv_ret == (size_t)(-1) )
            {
                // If there's no dest bytes left, then treat as E2BIG even if the error
                // is EILSEQ, etc.  We want E2BIG to take precedence like Windows.
                int err = (0 < dest.m_nBytesLeft ? errno : E2BIG);
                if ( E2BIG != err && failIfLossy )
                {
                    if ( NULL != pErrorCode )
                        *pErrorCode = ERROR_NO_UNICODE_TRANSLATION;
                    return 0;
                }

                switch ( err )
                {
                case EILSEQ: // Invalid multibyte sequence in input
                    if ( CP_UTF8 == m_srcCodePage )
                        src.SkipUtf8Ch();
                    else if ( 1 == sizeof(SrcType) )
                        src.SkipDoubleCh(); // DBCS
                    else
                        src.SkipSingleCh(); // utf32 or incomplate utf16 surrogate

                    if ( !AddDefault(&dest, pHasLoss, pErrorCode) )
                        return 0;

                    break;
                case EINVAL: // Incomplete multibyte sequence in input
                    if ( CP_UTF8 == m_srcCodePage )
                        src.SkipUtf8Ch();
                    else
                        src.SkipSingleCh();

                    if ( !AddDefault(&dest, pHasLoss, pErrorCode) )
                        return 0;

                    break;
                case E2BIG: // Output buffer is out of room
                    if ( NULL != pErrorCode )
                        *pErrorCode = ERROR_INSUFFICIENT_BUFFER;
                    return 0;
                default:
                    if ( NULL != pErrorCode )
                        *pErrorCode = ERROR_INVALID_PARAMETER;
                    return 0;
                }
            }
        }

        return cchDest - (dest.m_nBytesLeft / sizeof(DestType));
    }


public:
    EncodingConverter( UINT dstCodePage, UINT srcCodePage );
    ~EncodingConverter();

    bool Initialize();

    // Performs an encoding conversion.
    // Returns the number of dest chars written.
    // Input and output buffers should not overlap.
    template< class DestType, class SrcType, class AllocT >
    size_t Convert(
        DestType ** destBuffer,
        const SrcType * srcBuffer,  size_t cchSource,
        bool failIfLossy = false, bool * pHasLoss = NULL, DWORD * pErrorCode = NULL ) const
    {

        if ( !IsValidIConv() )
            return 0;

        iconv_buffer<SrcType> src(
            reinterpret_cast< char * >( const_cast< SrcType * >(srcBuffer) ),
            cchSource );

        size_t cchDest = cchSource;
        AutoArray< DestType, AllocT > newDestBuffer( cchDest );

        iconv_buffer<DestType> dest(
            reinterpret_cast< char * >(newDestBuffer.m_ptr),
            cchDest );

        size_t cchPrevCvt = 0;
        DWORD rcCvt;
        while ( true )
        {
            size_t cchCvt = Convert( dest, src, failIfLossy, pHasLoss, &rcCvt );
            if ( 0 == cchCvt )
            {
                if ( ERROR_INSUFFICIENT_BUFFER == rcCvt )
                {
                    // Alloc more and continue
                    cchPrevCvt = cchDest;
                    cchDest *= 2;
                    if ( !newDestBuffer.Realloc(cchDest) )
                    {
                        if ( NULL != pErrorCode )
                            *pErrorCode = ERROR_NOT_ENOUGH_MEMORY;
                        return 0;
                    }
                    // Fill newly allocated part of buffer
                    dest.Reset( reinterpret_cast< char * >(newDestBuffer.m_ptr+cchPrevCvt), cchDest );
                }
                else
                {
                    if ( NULL != pErrorCode )
                        *pErrorCode = rcCvt;
                    return 0;
                }
            }
            else
            {
                if ( NULL != pErrorCode )
                    *pErrorCode = rcCvt;
                *destBuffer = newDestBuffer.Detach();
                return cchPrevCvt + cchCvt;
            }
        }

    }
    // Performs an encoding conversion.
    // Returns the number of dest chars written.
    // Input and output buffers should not overlap.
    template< class DestType, class SrcType >
    size_t Convert(
        DestType * destBuffer,      size_t cchDest,
        const SrcType * srcBuffer,  size_t cchSource,
        bool failIfLossy = false, bool * pHasLoss = NULL, DWORD * pErrorCode = NULL ) const
    {

        if ( !IsValidIConv() )
            return 0;

        iconv_buffer<SrcType> src(
            reinterpret_cast< char * >( const_cast< SrcType * >(srcBuffer) ),
            cchSource );
        if ( 0 < cchDest )
        {
            iconv_buffer<DestType> dest(
                reinterpret_cast< char * >(destBuffer),
                cchDest );
            return Convert( dest, src, failIfLossy, pHasLoss, pErrorCode );
        }
        else
        {
            // Use fixed size buffer iteratively to determine final required length
            const size_t CCH_FIXED_SIZE = 256;
            char fixed_buf[ CCH_FIXED_SIZE*sizeof(DestType) ] = {'\0'};
            iconv_buffer<DestType> dest(
                &fixed_buf[0],
                CCH_FIXED_SIZE );

            bool hasLoss = false;
            DWORD rcCvt = ERROR_SUCCESS;
            size_t cchOnce = 0;
            size_t cchCumulative = 0;

            while ( 0 < src.m_nBytesLeft
                && 0 == (cchOnce = Convert(dest, src, failIfLossy, &hasLoss, &rcCvt))
                && ERROR_INSUFFICIENT_BUFFER == rcCvt )
            {
                cchCumulative += CCH_FIXED_SIZE;
                cchCumulative -= dest.m_nBytesLeft;
                dest.Reset( &fixed_buf[0], CCH_FIXED_SIZE );
            }
            if ( 0 < cchOnce )
                cchCumulative += cchOnce;
            if ( NULL != pErrorCode )
                *pErrorCode = (0 < cchCumulative ? ERROR_SUCCESS : rcCvt);
            if ( NULL != pHasLoss )
                *pHasLoss |= hasLoss;
            return cchCumulative;
        }

    }
};

#endif // _GLOBALIZATION_H_