diff options
Diffstat (limited to 'lib/picotok.c')
-rw-r--r-- | lib/picotok.c | 1563 |
1 files changed, 1563 insertions, 0 deletions
diff --git a/lib/picotok.c b/lib/picotok.c new file mode 100644 index 0000000..4488609 --- /dev/null +++ b/lib/picotok.c @@ -0,0 +1,1563 @@ +/* + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file picotok.c + * + * tokenizer + * + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * All rights reserved. + * + * History: + * - 2009-04-20 -- initial version + * + */ + + +/* ************************************************************/ +/* tokenisation and markup handling */ +/* ************************************************************/ + +/** @addtogroup picotok + @b tokenisation_overview + + markup handling overview: + + The following markups are recognized + - ignore + - speed + - pitch + - volume + - voice + - preproccontext + - mark + - play + - usesig + - genfile + - sentence + - s + - paragraph + - p + - break + - spell (pauses between letter) + - phoneme + + All markups which are recognized but are not yet implemented in pico + system have the mark. +*/ + + +#include "picodefs.h" +#include "picoos.h" +#include "picobase.h" +#include "picodbg.h" +#include "picodata.h" +#include "picotok.h" +#include "picoktab.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} +#endif + +/* *****************************************************************************/ + +#define IN_BUF_SIZE 255 +#define OUT_BUF_SIZE IN_BUF_SIZE + 3 * PICODATA_ITEM_HEADSIZE + 3 + +#define MARKUP_STRING_BUF_SIZE (IN_BUF_SIZE*5) +#define MAX_NR_MARKUP_PARAMS 6 +#define MARKUP_HANDLING_DISABLED 0 +#define MARKUP_HANDLING_ENABLED 1 +#define EOL '\n' + + +typedef picoos_int8 pico_tokenSubType; +typedef picoos_uint8 pico_tokenType; + +/** @todo : consider adding these specialized exception codes: */ + +#define PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE PICO_ERR_OTHER +#define PICO_ERR_INVALID_MARKUP_TAG PICO_ERR_OTHER +#define PICO_ERR_INTERNAL_LIMIT PICO_ERR_OTHER + +typedef enum {MIDummyStart, MIIgnore, + MIPitch, MISpeed, MIVolume, + MIVoice, MIPreprocContext, MIMarker, + MIPlay, MIUseSig, MIGenFile, MIParagraph, + MISentence, MIBreak, MISpell, MIPhoneme, MIItem, MISpeaker, MIDummyEnd + } MarkupId; +typedef enum {MSNotInMarkup, MSGotStart, MSExpectingmarkupTagName, MSInmarkupTagName, + MSGotmarkupTagName, MSInAttrName, MSGotAttrName, MSGotEqual, MSInAttrValue, + MSInAttrValueEscaped, MSGotAttrValue, MSGotEndSlash, MSGotEnd, + MSError, MSErrorTooLong, MSErrorSyntax + } MarkupState; +typedef enum {MENone, MEMissingStart, MEUnknownTag, MEIdent, MEMissingEqual, + MEMissingQuote, MEMissingEnd, MEUnexpectedChar, MEInterprete + } MarkupParseError; + +typedef enum {MTNone, MTStart, MTEnd, MTEmpty} MarkupTagType; + +#define UTF_CHAR_COMPLETE 2 +#define UTF_CHAR_INCOMPLETE 1 +#define UTF_CHAR_MALFORMED 0 + +#define TOK_MARKUP_KW_IGNORE (picoos_uchar*)"ignore" +#define TOK_MARKUP_KW_SPEED (picoos_uchar*)"speed" +#define TOK_MARKUP_KW_PITCH (picoos_uchar*)"pitch" +#define TOK_MARKUP_KW_VOLUME (picoos_uchar*)"volume" +#define TOK_MARKUP_KW_VOICE (picoos_uchar*)"voice" +#define TOK_MARKUP_KW_CONTEXT (picoos_uchar*)"preproccontext" +#define TOK_MARKUP_KW_MARK (picoos_uchar*)"mark" +#define TOK_MARKUP_KW_PLAY (picoos_uchar*)"play" +#define TOK_MARKUP_KW_USESIG (picoos_uchar*)"usesig" +#define TOK_MARKUP_KW_GENFILE (picoos_uchar*)"genfile" +#define TOK_MARKUP_KW_SENTENCE (picoos_uchar*)"sentence" +#define TOK_MARKUP_KW_S (picoos_uchar*)"s" +#define TOK_MARKUP_KW_PARAGRAPH (picoos_uchar*)"paragraph" +#define TOK_MARKUP_KW_P (picoos_uchar*)"p" +#define TOK_MARKUP_KW_BREAK (picoos_uchar*)"break" +#define TOK_MARKUP_KW_SPELL (picoos_uchar*)"spell" +#define TOK_MARKUP_KW_PHONEME (picoos_uchar*)"phoneme" +#define TOK_MARKUP_KW_ITEM (picoos_uchar*)"item" +#define TOK_MARKUP_KW_SPEAKER (picoos_uchar*)"speaker" + +#define KWLevel (picoos_uchar *)"level" +#define KWName (picoos_uchar *)"name" +#define KWProsDomain (picoos_uchar *)"prosodydomain" +#define KWTime (picoos_uchar *)"time" +#define KWMode (picoos_uchar *)"mode" +#define KWSB (picoos_uchar *)"sb" +#define KWPB (picoos_uchar *)"pb" +#define KWFile (picoos_uchar *)"file" +#define KWType (picoos_uchar *)"type" +#define KWF0Beg (picoos_uchar *)"f0beg" +#define KWF0End (picoos_uchar *)"f0end" +#define KWXFadeBeg (picoos_uchar *)"xfadebeg" +#define KWXFadeEnd (picoos_uchar *)"xfadeend" +#define KWAlphabet (picoos_uchar *)"alphabet" +#define KWPH (picoos_uchar *)"ph" +#define KWOrthMode (picoos_uchar *)"orthmode" +#define KWIgnorePunct (picoos_uchar *)"ignorepunct" +#define KWInfo1 (picoos_uchar *)"info1" +#define KWInfo2 (picoos_uchar *)"info2" +#define KWDATA (picoos_uchar *)"data" + +#define PICO_SPEED_MIN 20 +#define PICO_SPEED_MAX 500 +#define PICO_SPEED_DEFAULT 100 +#define PICO_SPEED_FACTOR_MIN 500 +#define PICO_SPEED_FACTOR_MAX 2000 + +#define PICO_PITCH_MIN 50 +#define PICO_PITCH_MAX 200 +#define PICO_PITCH_DEFAULT 100 +#define PICO_PITCH_FACTOR_MIN 500 +#define PICO_PITCH_FACTOR_MAX 2000 +#define PICO_PITCH_ADD_MIN -100 +#define PICO_PITCH_ADD_MAX 100 +#define PICO_PITCH_ADD_DEFAULT 0 + +#define PICO_VOLUME_MIN 0 +#define PICO_VOLUME_MAX 500 +#define PICO_VOLUME_DEFAULT 100 +#define PICO_VOLUME_FACTOR_MIN 500 +#define PICO_VOLUME_FACTOR_MAX 2000 + +#define PICO_SPEAKER_MIN 20 +#define PICO_SPEAKER_MAX 180 +#define PICO_SPEAKER_DEFAULT 100 +#define PICO_SPEAKER_FACTOR_MIN 500 +#define PICO_SPEAKER_FACTOR_MAX 2000 + +#define PICO_CONTEXT_DEFAULT (picoos_uchar*)"DEFAULT" + +#define PARAGRAPH_PAUSE_DUR 500 +#define SPELL_WITH_PHRASE_BREAK 1 +#define SPELL_WITH_SENTENCE_BREAK 2 + +/* *****************************************************************************/ + +#define TOK_PUNC_FLUSH (picoos_char) '\0' + +typedef picoos_uchar Word[MARKUP_STRING_BUF_SIZE]; + + +struct MarkupParam { + Word paramId; + Word paramVal; +}; + +typedef struct MarkupParam MarkupParams[MAX_NR_MARKUP_PARAMS]; + + +/** subobject : TokenizeUnit + * shortcut : tok + */ +typedef struct tok_subobj +{ + picoos_int32 ignLevel; + + picoos_uchar utf[5]; + picoos_int32 utfpos; + picoos_int32 utflen; + + MarkupParams markupParams; + picoos_int32 nrMarkupParams; + MarkupState markupState; + picoos_uchar markupStr[MARKUP_STRING_BUF_SIZE]; + picoos_int32 markupPos; + picoos_int32 markupLevel[MIDummyEnd+1]; + picoos_uchar markupTagName[IN_BUF_SIZE]; + MarkupTagType markupTagType; + MarkupParseError markupTagErr; + + picoos_int32 strPos; + picoos_uchar strDelim; + picoos_bool isFileAttr; + + pico_tokenType tokenType; + pico_tokenSubType tokenSubType; + + picoos_int32 tokenPos; + picoos_uchar tokenStr[IN_BUF_SIZE]; + + picoos_int32 nrEOL; + + picoos_bool markupHandlingMode; /* to be moved ??? */ + picoos_bool aborted; /* to be moved ??? */ + + picoos_bool start; + + picoos_uint8 outBuf[OUT_BUF_SIZE]; /* internal output buffer */ + picoos_uint16 outReadPos; /* next pos to read from outBuf */ + picoos_uint16 outWritePos; /* next pos to write to outBuf */ + + picoos_uchar saveFile[IN_BUF_SIZE]; + Word phonemes; + + picotrns_SimpleTransducer transducer; + + /* kbs */ + + picoktab_Graphs graphTab; + picokfst_FST xsampa_parser; + picokfst_FST svoxpa_parser; + picokfst_FST xsampa2svoxpa_mapper; + + + +} tok_subobj_t; + +/* *****************************************************************************/ + +static void tok_treatMarkupAsSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok); +static void tok_treatChar (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar ch, picoos_bool markupHandling); +static void tok_treatMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok); +static void tok_putToMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[]); +static void tok_treatSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok); +static MarkupId tok_markupTagId (picoos_uchar tagId[]); + +/* *****************************************************************************/ + +static picoos_bool tok_strEqual(picoos_uchar * str1, picoos_uchar * str2) +{ + return (picoos_strcmp((picoos_char*)str1, (picoos_char*)str2) == 0); +} + +static void tok_reduceBlanks(picoos_uchar * str) + /* Remove leading and trailing blanks of 'str' and reduce + groups of blanks within string to exactly one blank. */ + +{ + int i = 0; + int j = 0; + + while (str[j] != 0) { + if (str[j] == (picoos_uchar)' ') { + /* note one blank except at the beginning of string */ + if (i > 0) { + str[i] = (picoos_uchar)' '; + i++; + } + j++; + while (str[j] == (picoos_uchar)' ') { + j++; + } + } else { + str[i] = str[j]; + j++; + i++; + } + } + + /* remove blanks at end of string */ + if ((i > 0) && (str[i - 1] == ' ')) { + i--; + } + str[i] = 0; +} + + +static void tok_startIgnore (tok_subobj_t * tok) +{ + tok->ignLevel++; +} + + +static void tok_endIgnore (tok_subobj_t * tok) +{ + if (tok->ignLevel > 0) { + tok->ignLevel--; + } +} + + +static void tok_getParamIntVal (MarkupParams params, picoos_uchar paramId[], picoos_int32 * paramVal, picoos_bool * paramFound) +{ + int i=0; + + while ((i < MAX_NR_MARKUP_PARAMS) && !tok_strEqual(paramId,params[i].paramId)) { + i++; + } + if ((i < MAX_NR_MARKUP_PARAMS)) { + (*paramVal) = picoos_atoi((picoos_char*)params[i].paramVal); + (*paramFound) = TRUE; + } else { + (*paramVal) = -1; + (*paramFound) = FALSE; + } +} + + + +static void tok_getParamStrVal (MarkupParams params, picoos_uchar paramId[], picoos_uchar paramStrVal[], picoos_bool * paramFound) +{ + int i=0; + + while ((i < MAX_NR_MARKUP_PARAMS) && !tok_strEqual(paramId,params[i].paramId)) { + i++; + } + if (i < MAX_NR_MARKUP_PARAMS) { + picoos_strcpy((picoos_char*)paramStrVal, (picoos_char*)params[i].paramVal); + (*paramFound) = TRUE; + } else { + paramStrVal[0] = 0; + (*paramFound) = FALSE; + } +} + + +static void tok_getParamPhonesStr (MarkupParams params, picoos_uchar paramId[], picoos_uchar alphabet[], picoos_uchar phones[], picoos_int32 phoneslen, picoos_bool * paramFound) +{ + + int i; + picoos_bool done; + + i = 0; + while ((i < MAX_NR_MARKUP_PARAMS) && !tok_strEqual(paramId, params[i].paramId)) { + i++; + } + if (i < MAX_NR_MARKUP_PARAMS) { + if (tok_strEqual(alphabet, PICODATA_XSAMPA) || tok_strEqual(alphabet, (picoos_uchar*)"")) { + picoos_strlcpy((picoos_char*)phones, (picoos_char*)params[i].paramVal, phoneslen); + done = TRUE; + } else { + done = FALSE; + } + (*paramFound) = TRUE; + } else { + done = FALSE; + (*paramFound) = FALSE; + } + if (!done) { + phones[0] = 0; + } +} + + +static void tok_clearMarkupParams (MarkupParams params) +{ + int i; + + for (i = 0; i<MAX_NR_MARKUP_PARAMS; i++) { + params[i].paramId[0] = 0; + params[i].paramVal[0] = 0; + } +} + + +static void tok_getDur (picoos_uchar durStr[], picoos_uint32 * dur, picoos_bool * done) +{ + + int num=0; + int i=0; + picoos_uchar tmpWord[IN_BUF_SIZE]; + + picoos_strlcpy((picoos_char*)tmpWord, (picoos_char*)durStr, sizeof(tmpWord)); + tok_reduceBlanks(tmpWord); + while ((durStr[i] >= '0') && (durStr[i] <= '9')) { + num = 10 * num + (int)durStr[i] - (int)'0'; + tmpWord[i] = ' '; + i++; + } + tok_reduceBlanks(tmpWord); + if (tok_strEqual(tmpWord, (picoos_uchar*)"s")) { + (*dur) = (1000 * num); + (*done) = TRUE; + } else if (tok_strEqual(tmpWord,(picoos_uchar*)"ms")) { + (*dur) = num; + (*done) = TRUE; + } else { + (*dur) = 0; + (*done) = FALSE; + } +} + + +static picoos_int32 tok_putToUtf (tok_subobj_t * tok, picoos_uchar ch) +{ + if (tok->utfpos < PICOBASE_UTF8_MAXLEN) { + tok->utf[tok->utfpos] = ch; + if (tok->utfpos == 0) { + tok->utflen = picobase_det_utf8_length(ch); + } else if (((ch < (picoos_uchar)'\200') || (ch >= (picoos_uchar)'\300'))) { + tok->utflen = 0; + } + (tok->utfpos)++; + if ((tok->utfpos == tok->utflen)) { + if ((tok->utfpos < PICOBASE_UTF8_MAXLEN)) { + tok->utf[tok->utfpos] = 0; + } + return UTF_CHAR_COMPLETE; + } else if (tok->utfpos < tok->utflen) { + return UTF_CHAR_INCOMPLETE; + } else { + return UTF_CHAR_MALFORMED; + } + } else { + return UTF_CHAR_MALFORMED; + } +} + + +static picoos_bool tok_isRelative (picoos_uchar strval[], picoos_uint32 * val) +{ + picoos_int32 len; + picoos_bool rel; + + rel = FALSE; + len = picoos_strlen((picoos_char*)strval); + if (len > 0) { + if (strval[len - 1] == '%') { + strval[len - 1] = 0; + if ((strval[0] == '+') || (strval[0] == '-')) { + (*val) = 1000 + (picoos_atoi((picoos_char*)strval) * 10); + } else { + (*val) = picoos_atoi((picoos_char*)strval) * 10; + } + rel = TRUE; + } + } + return rel; +} + + +static void tok_putItem (picodata_ProcessingUnit this, tok_subobj_t * tok, + picoos_uint8 itemType, picoos_uint8 info1, picoos_uint8 info2, + picoos_uint16 val, + picoos_uchar str[]) +{ + picoos_int32 len, i; + + if ((itemType == PICODATA_ITEM_CMD) && (info1 == PICODATA_ITEMINFO1_CMD_FLUSH)) { + tok->outBuf[tok->outWritePos++] = itemType; + tok->outBuf[tok->outWritePos++] = info1; + tok->outBuf[tok->outWritePos++] = info2; + tok->outBuf[tok->outWritePos++] = 0; + } + else if (tok->ignLevel <= 0) { + switch (itemType) { + case PICODATA_ITEM_CMD: + switch (info1) { + case PICODATA_ITEMINFO1_CMD_CONTEXT: + case PICODATA_ITEMINFO1_CMD_VOICE: + case PICODATA_ITEMINFO1_CMD_MARKER: + case PICODATA_ITEMINFO1_CMD_PLAY: + case PICODATA_ITEMINFO1_CMD_SAVE: + case PICODATA_ITEMINFO1_CMD_UNSAVE: + case PICODATA_ITEMINFO1_CMD_PROSDOMAIN: + case PICODATA_ITEMINFO1_CMD_PHONEME: + len = picoos_strlen((picoos_char*)str); + if (tok->outWritePos + 4 + len < OUT_BUF_SIZE) { + tok->outBuf[tok->outWritePos++] = itemType; + tok->outBuf[tok->outWritePos++] = info1; + tok->outBuf[tok->outWritePos++] = info2; + tok->outBuf[tok->outWritePos++] = len; + for (i=0; i<len; i++) { + tok->outBuf[tok->outWritePos++] = str[i]; + } + } + else { + PICODBG_WARN(("tok_putItem: output buffer too small")); + } + break; + case PICODATA_ITEMINFO1_CMD_IGNSIG: + case PICODATA_ITEMINFO1_CMD_IGNORE: + if (tok->outWritePos + 4 < OUT_BUF_SIZE) { + tok->outBuf[tok->outWritePos++] = itemType; + tok->outBuf[tok->outWritePos++] = info1; + tok->outBuf[tok->outWritePos++] = info2; + tok->outBuf[tok->outWritePos++] = 0; + } + else { + PICODBG_WARN(("tok_putItem: output buffer too small")); + } + break; + case PICODATA_ITEMINFO1_CMD_SPEED: + case PICODATA_ITEMINFO1_CMD_PITCH: + case PICODATA_ITEMINFO1_CMD_VOLUME: + case PICODATA_ITEMINFO1_CMD_SPELL: + case PICODATA_ITEMINFO1_CMD_SIL: + case PICODATA_ITEMINFO1_CMD_SPEAKER: + if (tok->outWritePos + 4 + 2 < OUT_BUF_SIZE) { + tok->outBuf[tok->outWritePos++] = itemType; + tok->outBuf[tok->outWritePos++] = info1; + tok->outBuf[tok->outWritePos++] = info2; + tok->outBuf[tok->outWritePos++] = 2; + tok->outBuf[tok->outWritePos++] = val % 256; + tok->outBuf[tok->outWritePos++] = val / 256; + } + else { + PICODBG_WARN(("tok_putItem: output buffer too small")); + } + break; + default: + PICODBG_WARN(("tok_putItem: unknown command type")); + } + break; + case PICODATA_ITEM_TOKEN: + len = picoos_strlen((picoos_char*)str); + if (tok->outWritePos + 4 + len < OUT_BUF_SIZE) { + tok->outBuf[tok->outWritePos++] = itemType; + tok->outBuf[tok->outWritePos++] = info1; + tok->outBuf[tok->outWritePos++] = info2; + tok->outBuf[tok->outWritePos++] = len; + for (i=0; i<len; i++) { + tok->outBuf[tok->outWritePos++] = str[i]; + } + } + else { + PICODBG_WARN(("tok_putItem: output buffer too small")); + } + break; + default: + PICODBG_WARN(("tok_putItem: unknown item type")); + } + } +} + + +static void tok_putItem2 (picodata_ProcessingUnit this, tok_subobj_t * tok, + picoos_uint8 type, + picoos_uint8 info1, picoos_uint8 info2, + picoos_uint8 len, + picoos_uint8 data[]) +{ + picoos_int32 i; + + if (is_valid_itemtype(type)) { + tok->outBuf[tok->outWritePos++] = type; + tok->outBuf[tok->outWritePos++] = info1; + tok->outBuf[tok->outWritePos++] = info2; + tok->outBuf[tok->outWritePos++] = len; + for (i=0; i<len; i++) { + tok->outBuf[tok->outWritePos++] = data[i]; + } + } +} + + +static MarkupId tok_markupTagId (picoos_uchar tagId[]) +{ + if (picoos_strstr(tagId,(picoos_char *)"svox:") == (picoos_char *)tagId) { + tagId+=5; + } + if (tok_strEqual(tagId, TOK_MARKUP_KW_IGNORE)) { + return MIIgnore; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPEED)) { + return MISpeed; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PITCH)) { + return MIPitch; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_VOLUME)) { + return MIVolume; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPEAKER)) { + return MISpeaker; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_VOICE)) { + return MIVoice; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_CONTEXT)) { + return MIPreprocContext; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_MARK)) { + return MIMarker; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PLAY)) { + return MIPlay; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_USESIG)) { + return MIUseSig; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_GENFILE)) { + return MIGenFile; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SENTENCE) || tok_strEqual(tagId, TOK_MARKUP_KW_S)) { + return MISentence; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PARAGRAPH) || tok_strEqual(tagId, TOK_MARKUP_KW_P)) { + return MIParagraph; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_BREAK)) { + return MIBreak; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPELL)) { + return MISpell; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PHONEME)) { + return MIPhoneme; + } else if (tok_strEqual(tagId, TOK_MARKUP_KW_ITEM)) { + return MIItem; + } else { + return MIDummyEnd; + } +} + + +extern void tok_checkLimits (picodata_ProcessingUnit this, picoos_uint32 * value, picoos_uint32 min, picoos_uint32 max, picoos_uchar valueType[]) +{ + if ((((*value) < min) || ((*value) > max))) { + picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE, (picoos_char*)"", (picoos_char*)"attempt to set illegal value %i for %s", *value, valueType); + if (((*value) < min)) { + (*value) = min; + } else if (((*value) > max)) { + (*value) = max; + } + } +} + + + +extern void tok_checkRealLimits (picodata_ProcessingUnit this, picoos_single * value, picoos_single min, picoos_single max, picoos_uchar valueType[]) +{ + if ((((*value) < min) || ((*value) > max))) { + picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE, (picoos_char*)"", (picoos_char*)"attempt to set illegal value %f for %s", *value, valueType); + if (((*value) < min)) { + (*value) = min; + } else if (((*value) > max)) { + (*value) = max; + } + } +} + + +#define VAL_STR_LEN 21 + +static void tok_interpretMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_bool isStartTag, MarkupId mId) +{ + picoos_bool done; + picoos_int32 ival; + picoos_uint32 uval; + picoos_int32 ival2; + picoos_uchar valStr[VAL_STR_LEN]; + picoos_uchar valStr2[VAL_STR_LEN]; + picoos_uchar valStr3[VAL_STR_LEN]; + picoos_int32 i2; + picoos_uint32 dur; + picoos_bool done1; + picoos_bool paramFound; + picoos_uint8 type, info1, info2; + picoos_uint8 data[256]; + picoos_int32 pos, n, len; + picoos_uchar part[10]; + + done = FALSE; + switch (mId) { + case MIIgnore: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) { + tok_startIgnore(tok); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_endIgnore(tok); + done = TRUE; + } + break; + case MISpeed: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) { + if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) { + tok_checkLimits(this, & uval, PICO_SPEED_FACTOR_MIN, PICO_SPEED_FACTOR_MAX,(picoos_uchar*)"relative speed factor"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)""); + } else { + uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal); + tok_checkLimits(this, & uval, PICO_SPEED_MIN, PICO_SPEED_MAX,(picoos_uchar*)"speed"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)""); + } + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_SPEED_DEFAULT, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIPitch: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) { + if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) { + tok_checkLimits(this, & uval,PICO_PITCH_FACTOR_MIN,PICO_PITCH_FACTOR_MAX, (picoos_uchar*)"relative pitch factor"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)""); + } else { + uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal); + tok_checkLimits(this, & uval,PICO_PITCH_MIN,PICO_PITCH_MAX, (picoos_uchar*)"pitch"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH,PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)""); + } + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH,PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_PITCH_DEFAULT, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIVolume: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) { + if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) { + tok_checkLimits(this, & uval, PICO_VOLUME_FACTOR_MIN, PICO_VOLUME_FACTOR_MAX, (picoos_uchar*)"relative volume factor"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)""); + } else { + uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal); + tok_checkLimits(this, & uval, PICO_VOLUME_MIN, PICO_VOLUME_MAX, (picoos_uchar*)"volume"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)""); + } + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_VOLUME_DEFAULT, (picoos_uchar*)""); + done = TRUE; + } + break; + case MISpeaker: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) { + if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) { + tok_checkLimits(this, & uval, PICO_SPEAKER_FACTOR_MIN, PICO_SPEAKER_FACTOR_MAX, (picoos_uchar*)"relative speaker factor"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)""); + } else { + uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal); + tok_checkLimits(this, & uval, PICO_SPEAKER_MIN, PICO_SPEAKER_MAX, (picoos_uchar*)"volume"); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)""); + } + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_SPEAKER_DEFAULT, (picoos_uchar*)""); + done = TRUE; + } + break; + + case MIVoice: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOICE, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 0, 0, (picoos_uchar*)""); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOICE, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 0, 0, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIPreprocContext: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_CONTEXT, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_CONTEXT, PICODATA_ITEMINFO2_NA, 0, PICO_CONTEXT_DEFAULT); + done = TRUE; + } + break; + case MIMarker: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_MARKER, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) { + done = TRUE; + } + break; + case MISentence: + if (isStartTag) { + tok_getParamStrVal(tok->markupParams, KWProsDomain, (picoos_uchar*)valStr, & paramFound); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 2, 0, valStr); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 2, 0, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIParagraph: + if (isStartTag) { + tok_getParamStrVal(tok->markupParams, KWProsDomain, (picoos_uchar*)valStr, & paramFound); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 1, 0, valStr); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SIL, PICODATA_ITEMINFO2_NA, PARAGRAPH_PAUSE_DUR, (picoos_uchar*)""); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 1, 0, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIBreak: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWTime)) { + tok_getDur(tok->markupParams[0].paramVal, & dur, & done1); + tok_checkLimits (this, &dur, 0, 65535, (picoos_uchar*)"time"); + if (done1) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SIL, PICODATA_ITEMINFO2_NA, dur, (picoos_uchar*)""); + done = TRUE; + } + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + done = TRUE; + } + break; + case MISpell: + if (isStartTag) { + if (tok_strEqual(tok->markupParams[0].paramId, KWMode)) { + if (tok_strEqual(tok->markupParams[0].paramVal, KWPB)) { + uval = SPELL_WITH_PHRASE_BREAK; + } else if (tok_strEqual(tok->markupParams[0].paramVal, KWSB)) { + uval = SPELL_WITH_SENTENCE_BREAK; + } else { + tok_getDur(tok->markupParams[0].paramVal, & uval, & done1); + tok_checkLimits (this, & uval, 0, 65535, (picoos_uchar*)"time"); + if (done1) { + done = TRUE; + } + } + } else { + uval = SPELL_WITH_PHRASE_BREAK; + } + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPELL, PICODATA_ITEMINFO2_CMD_START, uval, (picoos_uchar*)""); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPELL, PICODATA_ITEMINFO2_CMD_END, 0, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIGenFile: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) { + if (tok->saveFile[0] != 0) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_UNSAVE, + picodata_getPuTypeFromExtension(tok->saveFile, /*input*/FALSE), 0, tok->saveFile); + tok->saveFile[0] = 0; + } + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SAVE, + picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/FALSE), 0, tok->markupParams[0].paramVal); + picoos_strcpy((picoos_char*)tok->saveFile, (picoos_char*)tok->markupParams[0].paramVal); + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + if (tok->saveFile[0] != 0) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_UNSAVE, + picodata_getPuTypeFromExtension(tok->saveFile, /*input*/FALSE), 0, (picoos_uchar*)""); + tok->saveFile[0] = 0; + } + done = TRUE; + } + break; + case MIPlay: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) { + if (picoos_FileExists(this->common, (picoos_char*)tok->markupParams[0].paramVal)) { + tok_getParamIntVal(tok->markupParams,KWF0Beg,& ival,& paramFound); + tok_getParamIntVal(tok->markupParams,KWF0End,& ival2,& paramFound); + tok_getParamStrVal(tok->markupParams,KWAlphabet,valStr3,& paramFound); + tok_getParamPhonesStr(tok->markupParams,KWXFadeBeg,valStr3,valStr,VAL_STR_LEN,& paramFound); + tok_getParamPhonesStr(tok->markupParams,KWXFadeEnd,valStr3,valStr2,VAL_STR_LEN,& paramFound); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PLAY, + picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/TRUE), 0, tok->markupParams[0].paramVal); + tok_startIgnore(tok); + } else { + if (tok->ignLevel > 0) { + tok_startIgnore(tok); + } else { + picoos_emRaiseWarning(this->common->em, PICO_EXC_CANT_OPEN_FILE, (picoos_char*)"", (picoos_char*)"file '%s' not found; synthesizing enclosed text instead\n", tok->markupParams[0].paramVal); + } + } + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_endIgnore(tok); + done = TRUE; + } + break; + case MIUseSig: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) { + if (picoos_FileExists(this->common, (picoos_char*)tok->markupParams[0].paramVal)) { + tok_getParamIntVal(tok->markupParams,KWF0Beg,& ival,& paramFound); + tok_getParamIntVal(tok->markupParams,KWF0End,& ival2,& paramFound); + tok_getParamStrVal(tok->markupParams,KWAlphabet,valStr3, & paramFound); + tok_getParamPhonesStr(tok->markupParams,KWXFadeBeg,valStr3,valStr,VAL_STR_LEN,& paramFound); + tok_getParamPhonesStr(tok->markupParams,KWXFadeEnd,valStr3,valStr2,VAL_STR_LEN,& paramFound); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PLAY, + picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/TRUE), 0, tok->markupParams[0].paramVal); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_IGNSIG, PICODATA_ITEMINFO2_CMD_START, 0, (picoos_uchar*)""); + } else { + if (tok->ignLevel <= 0) { + picoos_emRaiseWarning(this->common->em, PICO_EXC_CANT_OPEN_FILE, (picoos_char*)"", (picoos_char*)"file '%s' not found; synthesizing enclosed text instead", tok->markupParams[0].paramVal); + } + } + done = TRUE; + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_IGNSIG, PICODATA_ITEMINFO2_CMD_END, 0, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIPhoneme: + i2 = 0; + if (isStartTag) { + if (tok_strEqual(tok->markupParams[0].paramId, KWAlphabet) && tok_strEqual(tok->markupParams[1].paramId, KWPH)) { + if (tok_strEqual(tok->markupParams[2].paramId, KWOrthMode) + && tok_strEqual(tok->markupParams[2].paramVal, KWIgnorePunct)) { + i2 = 1; + } + if (picodata_mapPAStrToPAIds(tok->transducer, this->common, tok->xsampa_parser, tok->svoxpa_parser, tok->xsampa2svoxpa_mapper, tok->markupParams[1].paramVal, tok->markupParams[0].paramVal, tok->phonemes, sizeof(tok->phonemes)-1) == PICO_OK) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME, + PICODATA_ITEMINFO2_CMD_START, i2, tok->phonemes); + done = TRUE; + } else { + PICODBG_WARN(("cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal)); + picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE,(picoos_char*)"", (picoos_char*)"cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal); + done = TRUE; + } + } else if (tok_strEqual(tok->markupParams[0].paramId, KWPH)) { + if (tok_strEqual(tok->markupParams[1].paramId, KWOrthMode) + && tok_strEqual(tok->markupParams[1].paramVal, KWIgnorePunct)) { + i2 = 1; + } + if (picodata_mapPAStrToPAIds(tok->transducer, this->common, tok->xsampa_parser, tok->svoxpa_parser, tok->xsampa2svoxpa_mapper, tok->markupParams[0].paramVal, PICODATA_XSAMPA, tok->phonemes, sizeof(tok->phonemes)) == PICO_OK) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME, + PICODATA_ITEMINFO2_CMD_START, i2, tok->phonemes); + done = TRUE; + } + else { + PICODBG_WARN(("cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal)); + picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE,(picoos_char*)"", (picoos_char*)"cannot map phonetic string '%s'; synthesizing text instead", tok->markupParams[0].paramVal); + done = TRUE; + } + } + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) { + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME, + PICODATA_ITEMINFO2_CMD_END, i2, (picoos_uchar*)""); + done = TRUE; + } + break; + case MIItem: + if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWType) && + tok_strEqual(tok->markupParams[1].paramId, KWInfo1)&& + tok_strEqual(tok->markupParams[2].paramId, KWInfo2)&& + tok_strEqual(tok->markupParams[3].paramId, KWDATA)) { + picoos_int32 len2, n2; + type = picoos_atoi(tok->markupParams[0].paramVal); + info1 = picoos_atoi(tok->markupParams[1].paramVal); + info2 = picoos_atoi(tok->markupParams[2].paramVal); + n = 0; n2 = 0; + len2 = (picoos_int32)picoos_strlen(tok->markupParams[3].paramVal); + while (n<len2) { + while ((tok->markupParams[3].paramVal[n] != 0) && (tok->markupParams[3].paramVal[n] <= 32)) { + n++; + } + tok->markupParams[3].paramVal[n2] = tok->markupParams[3].paramVal[n]; + n++; + n2++; + } + if (is_valid_itemtype(type)) { + done = TRUE; + len = 0; + pos = 0; + picoos_get_sep_part_str(tok->markupParams[3].paramVal, picoos_strlen(tok->markupParams[3].paramVal), + &pos, ',', part, 10, &done1); + while (done && done1) { + n = picoos_atoi(part); + if ((n>=0) && (n<256) && (len<256)) { + data[len++] = n; + } + else { + done = FALSE; + } + picoos_get_sep_part_str(tok->markupParams[3].paramVal, picoos_strlen(tok->markupParams[3].paramVal), + &pos, ',', part, 10, &done1); + } + if (done) { + tok_putItem2(this, tok, type, info1, info2, len, data); + } + } + else { + done = FALSE; + } + } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) { + done = TRUE; + } + break; + default: + break; + } + if (!done) { + tok->markupTagErr = MEInterprete; + } + if (isStartTag) { + tok->markupLevel[mId]++; + } else if ((tok->markupLevel[mId] > 0)) { + tok->markupLevel[mId]--; + } +} + + +static picoos_bool tok_attrChar (picoos_uchar ch, picoos_bool first) +{ + return ((((ch >= (picoos_uchar)'A') && (ch <= (picoos_uchar)'Z')) || + ((ch >= (picoos_uchar)'a') && (ch <= (picoos_uchar)'z'))) || + ( !(first) && ((ch >= (picoos_uchar)'0') && (ch <= (picoos_uchar)'9')))); +} + + + +static picoos_bool tok_idChar (picoos_uchar ch, picoos_bool first) +{ + return tok_attrChar(ch, first) || ( !(first) && (ch == (picoos_uchar)':')); +} + + +static void tok_setIsFileAttr (picoos_uchar name[], picoos_bool * isFile) +{ + (*isFile) = tok_strEqual(name, KWFile); +} + +/* *****************************************************************************/ + +static void tok_putToSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[], pico_tokenType type, pico_tokenSubType subtype) +{ + int i, len; + + if (str[0] != 0) { + len = picoos_strlen((picoos_char*)str); + for (i = 0; i < len; i++) { + if (tok->tokenPos >= IN_BUF_SIZE) { + picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT, (picoos_char*)"", (picoos_char*)"simple token too long; forced treatment"); + tok_treatSimpleToken(this, tok); + } + tok->tokenStr[tok->tokenPos] = str[i]; + tok->tokenPos++; + } + } + tok->tokenType = type; + tok->tokenSubType = subtype; +} + + +static void tok_putToMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[]) +{ + picoos_int32 i, len; + picoos_uint8 ok; + + tok->markupTagErr = MENone; + len = picoos_strlen((picoos_char*)str); + for (i = 0; i< len; i++) { + if (tok->markupPos >= (MARKUP_STRING_BUF_SIZE - 1)) { + if ((tok->markupPos == (MARKUP_STRING_BUF_SIZE - 1)) && (tok_markupTagId(tok->markupTagName) != MIDummyEnd)) { + picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT ,(picoos_char*)"", (picoos_char*)"markup tag too long"); + } + tok->markupState = MSErrorTooLong; + } else if ((str[i] == (picoos_uchar)' ') && ((tok->markupState == MSExpectingmarkupTagName) || (tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSGotAttrName) || (tok->markupState == MSGotEqual) || (tok->markupState == MSGotAttrValue))) { + } else if ((str[i] == (picoos_uchar)'>') && ((tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSInmarkupTagName) || (tok->markupState == MSGotAttrValue))) { + tok->markupState = MSGotEnd; + } else if ((str[i] == (picoos_uchar)'/') && ((tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSInmarkupTagName) || (tok->markupState == MSGotAttrValue))) { + if (tok->markupTagType == MTEnd) { + tok->markupTagErr = MEUnexpectedChar; + tok->markupState = MSError; + } else { + tok->markupTagType = MTEmpty; + tok->markupState = MSGotEndSlash; + } + } else { + switch (tok->markupState) { + case MSNotInMarkup: + if (str[i] == (picoos_uchar)'<') { + tok_clearMarkupParams(tok->markupParams); + tok->nrMarkupParams = 0; + tok->strPos = 0; + tok->markupTagType = MTStart; + tok->markupState = MSGotStart; + } else { + tok->markupTagErr = MEMissingStart; + tok->markupState = MSError; + } + break; + case MSGotStart: + if (str[i] == (picoos_uchar)'/') { + tok->markupTagType = MTEnd; + tok->markupState = MSExpectingmarkupTagName; + } else if (str[i] == (picoos_uchar)' ') { + tok->markupState = MSExpectingmarkupTagName; + } else if (tok_idChar(str[i],TRUE)) { + tok->markupTagType = MTStart; + tok->markupTagName[tok->strPos] = str[i]; + tok->strPos++; + tok->markupTagName[tok->strPos] = 0; + tok->markupState = MSInmarkupTagName; + } else { + tok->markupTagErr = MEUnexpectedChar; + tok->markupState = MSError; + } + break; + case MSInmarkupTagName: case MSExpectingmarkupTagName: + if (tok_idChar(str[i],tok->markupState == MSExpectingmarkupTagName)) { + tok->markupTagName[tok->strPos] = str[i]; + tok->strPos++; + tok->markupTagName[(tok->strPos)] = 0; + tok->markupState = MSInmarkupTagName; + } else if ((tok->markupState == MSInmarkupTagName) && (str[i] == (picoos_uchar)' ')) { + tok->markupState = MSGotmarkupTagName; + picobase_lowercase_utf8_str(tok->markupTagName, (picoos_char*)tok->markupTagName, IN_BUF_SIZE, &ok); + tok->strPos = 0; + } else { + tok->markupTagErr = MEIdent; + tok->markupState = MSError; + } + break; + case MSGotmarkupTagName: case MSGotAttrValue: + if (tok_attrChar(str[i], TRUE)) { + if (tok->markupTagType == MTEnd) { + tok->markupTagErr = MEUnexpectedChar; + tok->markupState = MSError; + } else { + if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) { + tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = str[i]; + tok->strPos++; + tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = 0; + } else { + picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT ,(picoos_char*)"", (picoos_char*)"too many attributes in markup; ignoring"); + } + tok->markupState = MSInAttrName; + } + } else { + tok->markupTagErr = MEUnexpectedChar; + tok->markupState = MSError; + } + break; + case MSInAttrName: + if (tok_attrChar(str[i], FALSE)) { + if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) { + tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = str[i]; + tok->strPos++; + tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = 0; + } + tok->markupState = MSInAttrName; + } else if (str[i] == (picoos_uchar)' ') { + picobase_lowercase_utf8_str(tok->markupParams[tok->nrMarkupParams].paramId, (picoos_char*)tok->markupParams[tok->nrMarkupParams].paramId, IN_BUF_SIZE, &ok); + tok_setIsFileAttr(tok->markupParams[tok->nrMarkupParams].paramId, & tok->isFileAttr); + tok->markupState = MSGotAttrName; + } else if (str[i] == (picoos_uchar)'=') { + picobase_lowercase_utf8_str(tok->markupParams[tok->nrMarkupParams].paramId, (picoos_char*)tok->markupParams[tok->nrMarkupParams].paramId, IN_BUF_SIZE, &ok); + tok_setIsFileAttr(tok->markupParams[tok->nrMarkupParams].paramId, & tok->isFileAttr); + tok->markupState = MSGotEqual; + } else { + tok->markupTagErr = MEMissingEqual; + tok->markupState = MSError; + } + break; + case MSGotAttrName: + if (str[i] == (picoos_uchar)'=') { + tok->markupState = MSGotEqual; + } else { + tok->markupTagErr = MEMissingEqual; + tok->markupState = MSError; + } + break; + case MSGotEqual: + if ((str[i] == (picoos_uchar)'"') || (str[i] == (picoos_uchar)'\'')) { + tok->strDelim = str[i]; + tok->strPos = 0; + tok->markupState = MSInAttrValue; + } else { + tok->markupTagErr = MEMissingQuote; + tok->markupState = MSError; + } + break; + case MSInAttrValue: + if (!(tok->isFileAttr) && (str[i] == (picoos_uchar)'\\')) { + tok->markupState = MSInAttrValueEscaped; + } else if (str[i] == tok->strDelim) { + if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) { + tok->nrMarkupParams++; + } + tok->strPos = 0; + tok->markupState = MSGotAttrValue; + } else { + if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) { + tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = str[i]; + tok->strPos++; + tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = 0; + } + tok->markupState = MSInAttrValue; + } + break; + case MSInAttrValueEscaped: + if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) { + tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = str[i]; + tok->strPos++; + tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = 0; + } + tok->markupState = MSInAttrValue; + break; + case MSGotEndSlash: + if (str[i] == (picoos_uchar)'>') { + tok->markupState = MSGotEnd; + } else { + tok->markupTagErr = MEUnexpectedChar; + tok->markupState = MSError; + } + break; + default: + tok->markupTagErr = MEUnexpectedChar; + tok->markupState = MSError; + break; + } + } + tok->markupStr[tok->markupPos] = str[i]; + tok->markupPos++; + tok->markupStr[tok->markupPos] = 0; + } + /* + PICODBG_DEBUG(("putToMarkup %s", tok->markupStr)); + */ +} + +/* *****************************************************************************/ + +static void tok_treatMarkupAsSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok) +{ + picoos_int32 i; + + tok->utfpos = 0; + tok->utflen = 0; + tok->markupState = MSNotInMarkup; + for (i = 0; i < tok->markupPos; i++) { + tok_treatChar(this, tok, tok->markupStr[i], FALSE); + } + tok->markupPos = 0; + tok->strPos = 0; +} + + +static void tok_treatMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok) +{ + MarkupId mId; + + if (tok_markupTagId(tok->markupTagName) != MIDummyEnd) { + if (tok->markupTagErr == MENone) { + tok->markupState = MSNotInMarkup; + if ((tok->tokenType != PICODATA_ITEMINFO1_TOKTYPE_SPACE) && (tok->tokenType != PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED)) { + tok_treatSimpleToken(this, tok); + } + tok_putToSimpleToken(this, tok, (picoos_uchar*)" ", PICODATA_ITEMINFO1_TOKTYPE_SPACE, -1); + mId = tok_markupTagId(tok->markupTagName); + if ((tok->markupTagType == MTStart) || (tok->markupTagType == MTEmpty)) { + tok_interpretMarkup(this, tok, TRUE, mId); + } + if (((tok->markupTagType == MTEnd) || (tok->markupTagType == MTEmpty))) { + tok_clearMarkupParams(tok->markupParams); + tok->nrMarkupParams = 0; + tok_interpretMarkup(this, tok, FALSE,mId); + } + } + if (tok->markupTagErr != MENone) { + if (!tok->aborted) { + picoos_emRaiseWarning(this->common->em, PICO_ERR_INVALID_MARKUP_TAG, (picoos_char*)"", (picoos_char*)"syntax error in markup token '%s'",tok->markupStr); + } + tok_treatMarkupAsSimpleToken(this, tok); + } + } else { + tok_treatMarkupAsSimpleToken(this, tok); + } + tok->markupState = MSNotInMarkup; + tok->markupPos = 0; + tok->strPos = 0; +} + + + +static void tok_treatChar (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar ch, picoos_bool markupHandling) +{ + picoos_int32 id; + picoos_uint8 uval8; + pico_tokenType type = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED; + pico_tokenSubType subtype = -1; + picoos_bool dummy; + + if (ch == NULLC) { + tok_treatSimpleToken(this, tok); + tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)""); + } + else { + switch (tok_putToUtf(tok, ch)) { + case UTF_CHAR_MALFORMED: + tok->utfpos = 0; + tok->utflen = 0; + break; + case UTF_CHAR_INCOMPLETE: + break; + case UTF_CHAR_COMPLETE: + markupHandling = (markupHandling && (tok->markupHandlingMode == MARKUP_HANDLING_ENABLED)); + id = picoktab_graphOffset(tok->graphTab, tok->utf); + if (id > 0) { + if (picoktab_getIntPropTokenType(tok->graphTab, id, &uval8)) { + type = (pico_tokenType)uval8; + if (type == PICODATA_ITEMINFO1_TOKTYPE_LETTERV) { + type = PICODATA_ITEMINFO1_TOKTYPE_LETTER; + } + } + dummy = picoktab_getIntPropTokenSubType(tok->graphTab, id, &subtype); + } else if (ch <= (picoos_uchar)' ') { + type = PICODATA_ITEMINFO1_TOKTYPE_SPACE; + subtype = -1; + } else { + type = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED; + subtype = -1; + } + if ((ch > (picoos_uchar)' ')) { + tok->nrEOL = 0; + } else if ((ch == EOL)) { + tok->nrEOL++; + } + if (markupHandling && (tok->markupState != MSNotInMarkup)) { + tok_putToMarkup(this, tok, tok->utf); + if (tok->markupState >= MSError) { + tok_treatMarkupAsSimpleToken(this, tok); + } else if (tok->markupState == MSGotEnd) { + tok_treatMarkup(this, tok); + } + } else if ((markupHandling && (ch == (picoos_uchar)'<'))) { + tok_putToMarkup(this, tok, tok->utf); + } else if (type != PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED) { + if ((type != tok->tokenType) || (type == PICODATA_ITEMINFO1_TOKTYPE_CHAR) || (subtype != tok->tokenSubType)) { + tok_treatSimpleToken(this, tok); + } else if ((ch == EOL) && (tok->nrEOL == 2)) { + tok_treatSimpleToken(this, tok); + tok_putToSimpleToken(this, tok, (picoos_uchar*)".", PICODATA_ITEMINFO1_TOKTYPE_CHAR, -1); + tok_treatSimpleToken(this, tok); + } + tok_putToSimpleToken(this, tok, tok->utf, type, subtype); + } else { + tok_treatSimpleToken(this, tok); + } + tok->utfpos = 0; + tok->utflen = 0; + break; + } + } +} + + +static void tok_treatSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok) +{ + if (tok->tokenPos < IN_BUF_SIZE) { + tok->tokenStr[tok->tokenPos] = 0; + } + if (tok->markupState != MSNotInMarkup) { + if (!(tok->aborted) && (tok->markupState >= MSGotmarkupTagName) && (tok_markupTagId(tok->markupTagName) != MIDummyEnd)) { + picoos_emRaiseWarning(this->common->em, PICO_ERR_INVALID_MARKUP_TAG, (picoos_char*)"", (picoos_char*)"unfinished markup tag '%s'",tok->markupStr); + } + tok_treatMarkupAsSimpleToken(this, tok); + tok_treatSimpleToken(this, tok); + } else if ((tok->tokenPos > 0) && ((tok->ignLevel <= 0) || (tok->tokenType == PICODATA_ITEMINFO1_TOKTYPE_SPACE))) { + tok_putItem(this, tok, PICODATA_ITEM_TOKEN, tok->tokenType, (picoos_uint8)tok->tokenSubType, 0, tok->tokenStr); + } + tok->tokenPos = 0; + tok->tokenType = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED; + tok->tokenSubType = -1; +} + +/* *****************************************************************************/ + +static pico_status_t tokReset(register picodata_ProcessingUnit this) +{ + tok_subobj_t * tok; + MarkupId mId; + + if (NULL == this || NULL == this->subObj) { + return PICO_ERR_OTHER; + } + tok = (tok_subobj_t *) this->subObj; + + tok->ignLevel = 0; + + tok->utfpos = 0; + tok->utflen = 0; + + tok_clearMarkupParams(tok->markupParams); + tok->nrMarkupParams = 0; + tok->markupState = MSNotInMarkup; + tok->markupPos = 0; + for (mId = MIDummyStart; mId <= MIDummyEnd; mId++) { + tok->markupLevel[mId] = 0; + } + tok->markupTagName[0] = 0; + tok->markupTagType = MTNone; + tok->markupTagErr = MENone; + + tok->strPos = 0; + tok->strDelim = 0; + tok->isFileAttr = FALSE; + + tok->tokenType = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED; + tok->tokenSubType = -1; + tok->tokenPos = 0; + + tok->nrEOL = 0; + + + tok->markupHandlingMode = TRUE; + tok->aborted = FALSE; + + tok->start = TRUE; + + tok->outReadPos = 0; + tok->outWritePos = 0; + + tok->saveFile[0] = 0; + + + tok->graphTab = picoktab_getGraphs(this->voice->kbArray[PICOKNOW_KBID_TAB_GRAPHS]); + + tok->xsampa_parser = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_XSAMPA_PARSE]); + PICODBG_TRACE(("got xsampa_parser @ %i",tok->xsampa_parser)); + + tok->svoxpa_parser = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_SVOXPA_PARSE]); + PICODBG_TRACE(("got svoxpa_parser @ %i",tok->svoxpa_parser)); + + tok->xsampa2svoxpa_mapper = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_XSAMPA2SVOXPA]); + PICODBG_TRACE(("got xsampa2svoxpa_mapper @ %i",tok->xsampa2svoxpa_mapper)); + + + + return PICO_OK; +} + +static pico_status_t tokInitialize(register picodata_ProcessingUnit this) +{ +/* + + tok_subobj_t * tok; + + if (NULL == this || NULL == this->subObj) { + return PICO_ERR_OTHER; + } + tok = (tok_subobj_t *) this->subObj; +*/ + return tokReset(this); +} + + +static pico_status_t tokTerminate(register picodata_ProcessingUnit this) +{ + return PICO_OK; +} + +static picodata_step_result_t tokStep(register picodata_ProcessingUnit this, picoos_int16 mode, picoos_uint16 * numBytesOutput); + +static pico_status_t tokSubObjDeallocate(register picodata_ProcessingUnit this, + picoos_MemoryManager mm) +{ + + if (NULL != this) { + picoos_deallocate(this->common->mm, (void *) &this->subObj); + } + mm = mm; /* avoid warning "var not used in this function"*/ + return PICO_OK; +} + +picodata_ProcessingUnit picotok_newTokenizeUnit(picoos_MemoryManager mm, picoos_Common common, + picodata_CharBuffer cbIn, picodata_CharBuffer cbOut, + picorsrc_Voice voice) +{ + tok_subobj_t * tok; + picodata_ProcessingUnit this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice); + if (this == NULL) { + return NULL; + } + this->initialize = tokInitialize; + PICODBG_DEBUG(("set this->step to tokStep")); + this->step = tokStep; + this->terminate = tokTerminate; + this->subDeallocate = tokSubObjDeallocate; + this->subObj = picoos_allocate(mm, sizeof(tok_subobj_t)); + if (this->subObj == NULL) { + picoos_deallocate(mm, (void *)&this); + return NULL; + } + tok = (tok_subobj_t *) this->subObj; + tok->transducer = picotrns_newSimpleTransducer(mm, common, 10*(PICOTRNS_MAX_NUM_POSSYM+2)); + if (NULL == tok->transducer) { + tokSubObjDeallocate(this,mm); + picoos_deallocate(mm, (void *)&this); + return NULL; + } + tokInitialize(this); + return this; +} + +/** + * fill up internal buffer, try to locate token, write token to output + */ +picodata_step_result_t tokStep(register picodata_ProcessingUnit this, + picoos_int16 mode, picoos_uint16 * numBytesOutput) +{ + register tok_subobj_t * tok; + + if (NULL == this || NULL == this->subObj) { + return PICODATA_PU_ERROR; + } + tok = (tok_subobj_t *) this->subObj; + + mode = mode; /* avoid warning "var not used in this function"*/ + + *numBytesOutput = 0; + while (1) { /* exit via return */ + picoos_int16 ch; + + if ((tok->outWritePos - tok->outReadPos) > 0) { + if (picodata_cbPutItem(this->cbOut, &tok->outBuf[tok->outReadPos], tok->outWritePos - tok->outReadPos, numBytesOutput) == PICO_OK) { + PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG], + (picoos_uint8 *)"tok:", &tok->outBuf[tok->outReadPos], tok->outWritePos - tok->outReadPos); + tok->outReadPos += *numBytesOutput; + if (tok->outWritePos == tok->outReadPos) { + tok->outWritePos = 0; + tok->outReadPos = 0; + } + } + else { + return PICODATA_PU_OUT_FULL; + } + + } + else if (PICO_EOF != (ch = picodata_cbGetCh(this->cbIn))) { + PICODBG_DEBUG(("read in %c", (picoos_char) ch)); + tok_treatChar(this, tok, (picoos_uchar) ch, /*markupHandling*/TRUE); + } + else { + return PICODATA_PU_IDLE; + } + } +} + +#ifdef __cplusplus +} +#endif + +/* end */ |