/* * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * @file picotok.h * * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland * All rights reserved. * * History: * - 2009-04-20 -- initial version * */ /** @addtogroup picotok itemtype, iteminfo1, iteminfo2, content -> TYPE(INFO1,INFO2)content in the following input ===== - UTF8 text limitations: currently only german umlauts in addition to ASCII minimal input size (before processing starts) ================== processing (ie. tokenization) starts when - 'PICO_EOF' char received (which happens whenever the cbIn buffer is empty) - tok-internal buffer is full items output ============ processing the character stream can result in one of the following items: -> WORDGRAPH(NA,NA)graph <- mapped to lower case; incl. 1-2 digit nrs (0-99) -> OTHER(NA,NA)string <- skip or spell -> PUNC(PUNCtype,PUNCsubtype) -> CMD(CMDtype,CMDsubtype)args with - PUNCtype %d PICODATA_ITEMINFO1_PUNC_SENTEND PICODATA_ITEMINFO1_PUNC_PHRASEEND - PUNCsubtype %d PICODATA_ITEMINFO2_PUNC_SENT_T PICODATA_ITEMINFO2_PUNC_SENT_Q PICODATA_ITEMINFO2_PUNC_SENT_E PICODATA_ITEMINFO2_PUNC_PHRASE (used later: PICODATA_ITEMINFO2_PUNC_PHRASE_FORCED) - CMDtype %d PICODATA_ITEMINFO1_CMD_FLUSH (no args) ? PICODATA_ITEMINFO1_CMD_PLAY ? (not yet) - CMDsubtype %d PICODATA_ITEMINFO2_NA ? PICODATA_ITEMINFO2_CMD_PLAY_G2P ? (not yet) - graph, len>0, utf8 graphemes, %s - string, len>0, can be any string with printable ascii characters, %s other limitations ================= - item size: header plus len=256 (valid for Pico in general) */ #ifndef PICOTOK_H_ #define PICOTOK_H_ #include "picoos.h" #include "picodata.h" #include "picorsrc.h" #ifdef __cplusplus extern "C" { #endif #if 0 } #endif picodata_ProcessingUnit picotok_newTokenizeUnit( picoos_MemoryManager mm, picoos_Common common, picodata_CharBuffer cbIn, picodata_CharBuffer cbOut, picorsrc_Voice voice); #define PICOTOK_OUTBUF_SIZE 256 #ifdef __cplusplus } #endif #endif /*PICOTOK_H_*/