Tokenizer.cs source code in C# .NET

Source code for the .NET framework in C#

                        

Code:

/ 4.0 / 4.0 / untmp / DEVDIV_TFS / Dev10 / Releases / RTMRel / ndp / clr / src / BCL / System / Security / Util / Tokenizer.cs / 1305376 / Tokenizer.cs

                            // ==++== 
//
//   Copyright (c) Microsoft Corporation.  All rights reserved.
//
// ==--== 
/*============================================================
** 
** CLASS:    Tokenizer.cs 
**
** [....] 
**
**
** PURPOSE:  Tokenize "Elementary XML", that is, XML without
**           attributes or DTDs, in other words, XML with 
**           elements only.
** 
** 
===========================================================*/
namespace System.Security.Util 
{
    using System.Text;
    using System;
    using System.IO; 
    using System.Globalization;
    using System.Diagnostics.Contracts; 
 
    internal sealed class Tokenizer
    { 
        // There are five externally knowable token types: bras, kets,
        // slashes, cstrs, and equals.

        internal const byte bra = 0; 
        internal const byte ket = 1;
        internal const byte slash = 2; 
        internal const byte cstr = 3; 
        internal const byte equals = 4;
        internal const byte quest = 5; 
        internal const byte bang = 6;
        internal const byte dash = 7;

        // these are the assorted text characters that the tokenizer must 
        // understand to do its job
 
        internal const int intOpenBracket = (int) '<'; 
        internal const int intCloseBracket = (int) '>';
        internal const int intSlash = (int) '/'; 
        internal const int intEquals = (int) '=';
        internal const int intQuote = (int) '\"';
        internal const int intQuest = (int) '?';
        internal const int intBang = (int) '!'; 
        internal const int intDash = (int) '-';
        internal const int intTab = (int) '\t'; 
        internal const int intCR = (int) '\r'; 
        internal const int intLF = (int) '\n';
        internal const int intSpace = (int) ' '; 

        // this tells us where we will be getting our input from
        // and what the encoding is (if any)
 
        private enum TokenSource
        { 
            UnicodeByteArray,  // this is little-endian unicode (there are other encodings) 
            UTF8ByteArray,
            ASCIIByteArray, 
            CharArray,
            String,
            NestedStrings,
            Other 
        }
 
        // byte streams can come in 3 different flavors 

        internal enum ByteTokenEncoding 
        {
            UnicodeTokens,  // this is little-endian unicode (there are other encodings)
            UTF8Tokens,
            ByteTokens 
        }
 
        public int  LineNo; 

        // these variables represent the input state of the of the tokenizer 

        private int _inProcessingTag;
        private byte[] _inBytes;
        private char[] _inChars; 
        private String _inString;
        private int _inIndex; 
        private int _inSize; 
        private int _inSavedCharacter;
        private TokenSource _inTokenSource; 
        private ITokenReader _inTokenReader;

        // these variables are used to build and deliver tokenizer output strings
 
        private StringMaker _maker = null;
 
        private String[] _searchStrings; 
        private String[] _replaceStrings;
 
        private int _inNestedIndex;
        private int _inNestedSize;
        private String _inNestedString;
 

        //=============================================================== 
        // Constructor uses given ICharInputStream 
        //
 
        internal void BasicInitialization()
        {
            LineNo = 1 ;
            _inProcessingTag = 0; 
            _inSavedCharacter = -1;
            _inIndex = 0; 
            _inSize = 0; 
            _inNestedSize = 0;
            _inNestedIndex = 0; 
            _inTokenSource = TokenSource.Other;
            _maker = System.SharedStatics.GetSharedStringMaker();
        }
 
        public void Recycle()
        { 
            System.SharedStatics.ReleaseSharedStringMaker(ref _maker); // will set _maker to null 
        }
 
        internal Tokenizer (String input)
        {
            BasicInitialization();
            _inString = input; 
            _inSize = input.Length;
            _inTokenSource = TokenSource.String; 
        } 

        internal Tokenizer (String input, String[] searchStrings, String[] replaceStrings) 
        {
            BasicInitialization();
            _inString = input;
            _inSize = _inString.Length; 
            _inTokenSource = TokenSource.NestedStrings;
            _searchStrings = searchStrings; 
            _replaceStrings = replaceStrings; 

#if DEBUG 
            Contract.Assert(searchStrings.Length == replaceStrings.Length, "different number of search/replace strings");
            Contract.Assert(searchStrings.Length != 0, "no search replace strings, shouldn't be using this ctor");

            for (int istr=0; istr= 3 , "XML Slug too small");
                Contract.Assert( str[0] == '{', "XML Slug doesn't start with '{'" ); 
                Contract.Assert( str[str.Length-1] == '}', "XML Slug doesn't end with '}'" );

                str = replaceStrings[istr];
                Contract.Assert( str != null, "XML Replacement null"); 
                Contract.Assert( str.Length >= 1, "XML Replacement empty");
            } 
#endif 
        }
 
        internal Tokenizer (byte[] array, ByteTokenEncoding encoding, int startIndex)
           {
            BasicInitialization();
            _inBytes = array; 
            _inSize = array.Length;
            _inIndex = startIndex; 
 
            switch (encoding)
            { 
            case ByteTokenEncoding.UnicodeTokens:
                _inTokenSource = TokenSource.UnicodeByteArray;
                break;
 
            case ByteTokenEncoding.UTF8Tokens:
                _inTokenSource = TokenSource.UTF8ByteArray; 
                break; 

            case ByteTokenEncoding.ByteTokens: 
                _inTokenSource = TokenSource.ASCIIByteArray;
                break;

            default: 
                throw new ArgumentException(Environment.GetResourceString("Arg_EnumIllegalVal", (int)encoding));
            } 
           } 

        internal Tokenizer (char[] array) 
        {
            BasicInitialization();
            _inChars = array;
            _inSize = array.Length; 
            _inTokenSource = TokenSource.CharArray;
        } 
 
        internal Tokenizer (StreamReader input)
        { 
            BasicInitialization();
            _inTokenReader = new StreamTokenReader(input);
        }
 
        internal void ChangeFormat( System.Text.Encoding encoding )
        { 
            if (encoding == null) 
            {
                return; 
            }

            Contract.Assert( _inSavedCharacter == -1, "There was a lookahead character at the stream change point, that means the parser is changing encodings too late" );
 
            switch (_inTokenSource)
            { 
                case TokenSource.UnicodeByteArray: 
                case TokenSource.UTF8ByteArray:
                case TokenSource.ASCIIByteArray: 
                    // these are the ones we can change on the fly

                    if (encoding == System.Text.Encoding.Unicode)
                    { 
                        _inTokenSource = TokenSource.UnicodeByteArray;
                        return; 
                    } 

                    if (encoding == System.Text.Encoding.UTF8) 
                    {
                        _inTokenSource = TokenSource.UTF8ByteArray;
                        return;
                    } 
#if FEATURE_ASCII
                    if (encoding == System.Text.Encoding.ASCII) 
                    { 
                        _inTokenSource = TokenSource.ASCIIByteArray;
                        return; 
                    }
#endif
                    break;
 
                case TokenSource.String:
                case TokenSource.CharArray: 
                case TokenSource.NestedStrings: 
                    // these are already unicode and encoding changes are moot
                    // they can't be further decoded 
                    return;
            }

            // if we're here it means we don't know how to change 
            // to the desired encoding with the memory that we have
            // we'll have to do this the hard way -- that means 
            // creating a suitable stream from what we've got 

            // this is thankfully the rare case as UTF8 and unicode 
            // dominate the scene

            Stream stream = null;
 
            switch (_inTokenSource)
            { 
                case TokenSource.UnicodeByteArray: 
                case TokenSource.UTF8ByteArray:
                case TokenSource.ASCIIByteArray: 
                    stream = new MemoryStream(_inBytes, _inIndex, _inSize - _inIndex);
                    break;

                case TokenSource.CharArray: 
                case TokenSource.String:
                case TokenSource.NestedStrings: 
                    Contract.Assert(false, "attempting to change encoding on a non-changable source, should have been prevented earlier" ); 
                    return;
 
                default:
                    StreamTokenReader reader = _inTokenReader as StreamTokenReader;

                    if (reader == null) 
                    {
                        Contract.Assert(false, "A new input source type has been added to the Tokenizer but it doesn't support encoding changes"); 
                        return; 
                    }
 
                    stream = reader._in.BaseStream;

                    Contract.Assert( reader._in.CurrentEncoding != null, "Tokenizer's StreamReader does not have an encoding" );
 
                    String fakeReadString = new String(' ', reader.NumCharEncountered);
                    stream.Position = reader._in.CurrentEncoding.GetByteCount( fakeReadString ); 
                    break; 
            }
 
            Contract.Assert(stream != null, "The XML stream with new encoding was not properly initialized for kind of input we had");

            // we now have an initialized memory stream based on whatever source we had before
            _inTokenReader = new StreamTokenReader( new StreamReader( stream, encoding ) ); 
            _inTokenSource = TokenSource.Other;
        } 
 
        internal void GetTokens( TokenizerStream stream, int maxNum, bool endAfterKet )
        { 
            while (maxNum == -1 || stream.GetTokenCount() < maxNum)
            {
                int i = -1;
                byte ch; 
                int cb = 0;
                bool inLiteral = false; 
                bool inQuotedString = false; 

                StringMaker m = _maker; 

                m._outStringBuilder = null;
                m._outIndex = 0;
 
            BEGINNING:
 
                if (_inSavedCharacter != -1) 
                {
                       i = _inSavedCharacter; 
                       _inSavedCharacter = -1;
                }
                else switch (_inTokenSource)
                { 
                case TokenSource.UnicodeByteArray:
                    if (_inIndex + 1 >= _inSize) 
                    { 
                        stream.AddToken( -1 );
                        return; 
                    }

                     i = (int)((_inBytes[_inIndex+1]<<8) + _inBytes[_inIndex]);
                      _inIndex += 2; 
                    break;
 
                 case TokenSource.UTF8ByteArray: 
                    if (_inIndex >= _inSize)
                    { 
                        stream.AddToken( -1 );
                        return;
                    }
 
                    i = (int)(_inBytes[_inIndex++]);
 
                    // single byte -- case, early out as we're done already 
                    if ((i & 0x80) == 0x00)
                        break; 

                    // to decode the lead byte switch on the high nibble
                    // shifted down so the switch gets dense integers
                    switch ((i & 0xf0) >>4) 
                    {
                    case 0x8:   // 1000  (together these 4 make 10xxxxx) 
                    case 0x9:   // 1001 
                    case 0xa:   // 1010
                    case 0xb:   // 1011 
                        // trail byte is an error
                        throw new XmlSyntaxException( LineNo );

                    case 0xc:   // 1100  (these two make 110xxxxx) 
                    case 0xd:   // 1101
                        // two byte encoding (1 trail byte) 
                        i &= 0x1f; 
                        cb = 2;
                        break; 

                    case 0xe:   // 1110 (this gets us 1110xxxx)
                        // three byte encoding (2 trail bytes)
                        i &= 0x0f; 
                        cb = 3;
                        break; 
 
                    case 0xf:   // 1111 (and finally 1111xxxx)
                        // 4 byte encoding is an error 
                        throw new XmlSyntaxException( LineNo );
                    }

                    // at least one trail byte, fetch it 
                    if (_inIndex >= _inSize)
                        throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" )); 
 
                    ch = _inBytes[_inIndex++];
 
                    // must be trail byte encoding
                    if ((ch & 0xc0) != 0x80)
                        throw new XmlSyntaxException( LineNo );
 
                    i = (i<<6) | (ch & 0x3f);
 
                    // done now if 2 byte encoding, otherwise go for 3 
                    if (cb == 2)
                        break; 

                    if (_inIndex >= _inSize)
                        throw new XmlSyntaxException (LineNo, Environment.GetResourceString( "XMLSyntax_UnexpectedEndOfFile" ));
 
                    ch = _inBytes[_inIndex++];
 
                    // must be trail byte encoding 
                    if ((ch & 0xc0) != 0x80)
                        throw new XmlSyntaxException( LineNo ); 

                    i = (i<<6) | (ch & 0x3f);
                    break;
 
                case TokenSource.ASCIIByteArray:
                    if (_inIndex >= _inSize) 
                    { 
                        stream.AddToken( -1 );
                        return; 
                    }

                    i = (int)(_inBytes[_inIndex++]);
                    break; 

                case TokenSource.CharArray: 
                    if (_inIndex >= _inSize) 
                    {
                        stream.AddToken( -1 ); 
                        return;
                    }

                     i = (int)(_inChars[_inIndex++]); 
                    break;
 
                case TokenSource.String: 
                    if (_inIndex >= _inSize)
                    { 
                        stream.AddToken( -1 );
                        return;
                    }
 
                    i = (int)(_inString[_inIndex++]);
                    break; 
 
                case TokenSource.NestedStrings:
                    if (_inNestedSize != 0) 
                    {
                        if (_inNestedIndex < _inNestedSize)
                        {
                            i = _inNestedString[_inNestedIndex++]; 
                            break;
                        } 
 
                        _inNestedSize = 0;
                    } 

                    if (_inIndex >= _inSize)
                    {
                        stream.AddToken( -1 ); 
                        return;
                    } 
 
                    i = (int)(_inString[_inIndex++]);
 
                    if (i != '{')
                        break;

                    for (int istr=0; istr < _searchStrings.Length; istr++) 
                    {
                        if (0==String.Compare(_searchStrings[istr], 0, _inString, _inIndex-1, _searchStrings[istr].Length, StringComparison.Ordinal)) 
                        { 
                            _inNestedString = _replaceStrings[istr];
                            _inNestedSize = _inNestedString.Length; 
                            _inNestedIndex = 1;
                            i = _inNestedString[0];
                            _inIndex += _searchStrings[istr].Length - 1;
                            break; 
                        }
                    } 
 
                    break;
 
                default:
                    i = _inTokenReader.Read();
                    if (i == -1)
                    { 
                        stream.AddToken( -1 );
                        return; 
                    } 
                    break;
                } 

                if (!inLiteral)
                {
                    switch (i) 
                    {
                    // skip whitespace 
                    case intSpace: 
                    case intTab:
                    case intCR: 
                        goto BEGINNING;

                    // count linefeeds
                    case intLF: 
                        LineNo++;
                        goto BEGINNING; 
 
                    case intOpenBracket:
                        _inProcessingTag++; 
                        stream.AddToken( bra );
                        continue;

                    case intCloseBracket: 
                        _inProcessingTag--;
                        stream.AddToken( ket ); 
                        if (endAfterKet) 
                            return;
                        continue; 

                    case intEquals:
                        stream.AddToken( equals );
                        continue; 

                    case intSlash: 
                        if (_inProcessingTag != 0) 
                        {
                            stream.AddToken( slash ); 
                            continue;
                        }
                        break;
 
                    case intQuest:
                        if (_inProcessingTag != 0) 
                        { 
                            stream.AddToken( quest );
                            continue; 
                        }
                        break;

                    case intBang: 
                        if (_inProcessingTag != 0)
                        { 
                            stream.AddToken( bang ); 
                            continue;
                        } 
                        break;

                    case intDash:
                        if (_inProcessingTag != 0) 
                        {
                            stream.AddToken( dash ); 
                            continue; 
                        }
                        break; 

                    case intQuote:
                        inLiteral = true;
                        inQuotedString = true; 
                        goto BEGINNING;
                    } 
                } 
                else
                   { 
                    switch (i)
                    {
                    case intOpenBracket:
                        if (!inQuotedString) 
                        {
                            _inSavedCharacter = i; 
                            stream.AddToken( cstr ); 
                            stream.AddString( this.GetStringToken() );
                            continue; 
                        }
                        break;

                    case intCloseBracket: 
                    case intEquals:
                    case intSlash: 
                        if (!inQuotedString && _inProcessingTag != 0) 
                        {
                            _inSavedCharacter = i; 
                            stream.AddToken( cstr );
                            stream.AddString( this.GetStringToken() );
                            continue;
                        } 
                        break;
 
                    case intQuote: 
                        if (inQuotedString)
                        { 
                            stream.AddToken( cstr );
                            stream.AddString( this.GetStringToken() );
                            continue;
                        } 
                        break;
 
                    case intTab: 
                    case intCR:
                    case intSpace: 
                        if (!inQuotedString)
                        {
                            stream.AddToken( cstr );
                            stream.AddString( this.GetStringToken() ); 
                            continue;
                        } 
                        break; 

                    // count linefeeds 
                    case intLF:
                        LineNo++;

                        if (!inQuotedString) 
                        {
                            stream.AddToken( cstr ); 
                            stream.AddString( this.GetStringToken() ); 
                            continue;
                        } 
                        break;
                    }
                }
 
                inLiteral = true;
 
                // add character  to the string 
                if (m._outIndex < StringMaker.outMaxSize)
                { 
                    // easy case
                    m._outChars[m._outIndex++] = (char)i;
                }
                else 
                {
                    if (m._outStringBuilder == null) 
                    { 
                       // OK, first check if we have to init the StringBuilder
                        m._outStringBuilder = new StringBuilder(); 
                    }

                    // OK, copy from _outChars to _outStringBuilder
                    m._outStringBuilder.Append(m._outChars, 0, StringMaker.outMaxSize); 

                    // reset _outChars pointer 
                    m._outChars[0] = (char)i; 
                    m._outIndex = 1;
                } 

                goto BEGINNING;
            }
        } 

        [Serializable] 
        internal sealed class StringMaker 
        {
            String[] aStrings; 
            uint cStringsMax;
            uint cStringsUsed;

            public StringBuilder _outStringBuilder; 
            public char[] _outChars;
            public int _outIndex; 
 
            public const int outMaxSize = 512;
 
            static uint HashString(String str)
            {
                uint hash = 0;
 
                int l = str.Length;
 
                // rotate in string character 
                for (int i=0; i < l; i++)
                { 
                    hash = (hash << 3) ^ (uint)str[i] ^ (hash >> 29);
                }

                return hash; 
            }
 
            static uint HashCharArray(char[] a, int l) 
            {
                uint hash = 0; 

                // rotate in a character
                for (int i=0; i < l; i++)
                { 
                    hash = (hash << 3) ^ (uint)a[i] ^ (hash >> 29);
                } 
 
                return hash;
            } 

            public StringMaker()
            {
                cStringsMax = 2048; 
                cStringsUsed = 0;
                aStrings = new String[cStringsMax]; 
                _outChars = new char[outMaxSize]; 
            }
 
            bool CompareStringAndChars(String str, char [] a, int l)
            {
                if (str.Length != l)
                    return false; 

                for (int i=0; i (cStringsMax / 4) * 3)
                {
                    // we need to rehash
 
                    uint cNewMax = cStringsMax * 2;
                    String [] aStringsNew = new String[cNewMax]; 
 
                    for (int i=0; i < cStringsMax;i++)
                    { 
                        if (aStrings[i] != null)
                        {
                            hash = HashString(aStrings[i]) % cNewMax;
 
                            while (aStringsNew[hash] != null)
                            { 
                                // slot full, skip 
                                if (++hash >= cNewMax)
                                    hash = 0; 
                            }

                            aStringsNew[hash] = aStrings[i];
                        } 
                    }
 
                    // all done, cutover to the new hash table 
                    cStringsMax = cNewMax;
                    aStrings = aStringsNew; 
                }

                hash = HashCharArray(a, l) % cStringsMax;
 
                String str;
 
                while ((str = aStrings[hash]) != null) 
                {
                    if (CompareStringAndChars(str, a, l)) 
                        return str;

                    if (++hash >= cStringsMax)
                        hash = 0; 
                }
 
                str = new String(a,0,l); 
                aStrings[hash] = str;
                cStringsUsed++; 

                return str;
            }
        } 

        //=============================================================== 
        // 
        //
 
        private String GetStringToken ()
        {
            return _maker.MakeString();
        } 

        internal interface ITokenReader 
        { 
            int Read();
        } 

        internal class StreamTokenReader : ITokenReader
        {
 
            internal StreamReader _in;
            internal int _numCharRead; 
 
            internal StreamTokenReader(StreamReader input)
            { 
                _in = input;
                _numCharRead = 0;
            }
 
            public virtual int Read()
            { 
                int value = _in.Read(); 
                if (value != -1)
                    _numCharRead++; 
                return value;
            }

            internal int NumCharEncountered 
            {
                get 
                { 
                    return _numCharRead;
                } 
            }
        }
    }
 
    internal sealed class TokenizerShortBlock
    { 
        internal short[] m_block = new short[16]; 
        internal TokenizerShortBlock m_next = null;
    } 

    internal sealed class TokenizerStringBlock
    {
        internal String[] m_block = new String[16]; 
        internal TokenizerStringBlock m_next = null;
    } 
 

    internal sealed class TokenizerStream 
    {
        private int m_countTokens;

        private TokenizerShortBlock m_headTokens; 
        private TokenizerShortBlock m_lastTokens;
        private TokenizerShortBlock m_currentTokens; 
        private int m_indexTokens; 

        private TokenizerStringBlock m_headStrings; 
        private TokenizerStringBlock m_currentStrings;
        private int m_indexStrings;

#if _DEBUG 
        private bool m_bLastWasCStr;
#endif 
 
        internal TokenizerStream()
        { 
            m_countTokens = 0;
            m_headTokens = new TokenizerShortBlock();
            m_headStrings = new TokenizerStringBlock();
            Reset(); 
        }
 
        internal void AddToken( short token ) 
        {
            if (m_currentTokens.m_block.Length <= m_indexTokens) 
            {
                m_currentTokens.m_next = new TokenizerShortBlock();
                m_currentTokens = m_currentTokens.m_next;
                m_indexTokens = 0; 
            }
 
            m_countTokens++; 
            m_currentTokens.m_block[m_indexTokens++] = token;
        } 

        internal void AddString( String str )
        {
            if (m_currentStrings.m_block.Length <= m_indexStrings) 
            {
                m_currentStrings.m_next = new TokenizerStringBlock(); 
                m_currentStrings = m_currentStrings.m_next; 
                m_indexStrings = 0;
            } 

            m_currentStrings.m_block[m_indexStrings++] = str;
        }
 
        internal void Reset()
        { 
            m_lastTokens = null; 
            m_currentTokens = m_headTokens;
            m_currentStrings = m_headStrings; 
            m_indexTokens = 0;
            m_indexStrings = 0;
#if _DEBUG
            m_bLastWasCStr = false; 
#endif
        } 
 
        internal short GetNextFullToken()
        { 
            if (m_currentTokens.m_block.Length <= m_indexTokens)
            {
                m_lastTokens = m_currentTokens;
                m_currentTokens = m_currentTokens.m_next; 
                m_indexTokens = 0;
            } 
 
            return m_currentTokens.m_block[m_indexTokens++];
        } 

        internal short GetNextToken()
        {
            short retval = (short)(GetNextFullToken() & 0x00FF); 
#if _DEBUG
            Contract.Assert( !m_bLastWasCStr, "CStr token not followed by GetNextString()" ); 
            m_bLastWasCStr = (retval == Tokenizer.cstr); 
#endif
            return retval; 
        }

        internal String GetNextString()
        { 
            if (m_currentStrings.m_block.Length <= m_indexStrings)
            { 
                m_currentStrings = m_currentStrings.m_next; 
                m_indexStrings = 0;
            } 
#if _DEBUG
            m_bLastWasCStr = false;
#endif
            return m_currentStrings.m_block[m_indexStrings++]; 
        }
 
        internal void ThrowAwayNextString() 
        {
            GetNextString(); 
        }

        internal void TagLastToken( short tag )
        { 
            if (m_indexTokens == 0)
                m_lastTokens.m_block[m_lastTokens.m_block.Length-1] = (short)((ushort)m_lastTokens.m_block[m_lastTokens.m_block.Length-1] | (ushort)tag); 
            else 
                m_currentTokens.m_block[m_indexTokens-1] = (short)((ushort)m_currentTokens.m_block[m_indexTokens-1] | (ushort)tag);
        } 

        internal int GetTokenCount()
        {
            return m_countTokens; 
        }
 
        internal void GoToPosition( int position ) 
        {
            Reset(); 

            for (int count = 0; count < position; ++count)
            {
                if (GetNextToken() == Tokenizer.cstr) 
                    ThrowAwayNextString();
            } 
        } 
    }
} 

// File provided for Reference Use Only by Microsoft Corporation (c) 2007.
                        

Link Menu

Network programming in C#, Network Programming in VB.NET, Network Programming in .NET
This book is available now!
Buy at Amazon US or
Buy at Amazon UK