diff options
Diffstat (limited to 'lib/picotok.h')
-rw-r--r-- | lib/picotok.h | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/lib/picotok.h b/lib/picotok.h new file mode 100644 index 0000000..b602408 --- /dev/null +++ b/lib/picotok.h @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file picotok.h + * + * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland + * All rights reserved. + * + * History: + * - 2009-04-20 -- initial version + * + */ + + +/** @addtogroup picotok +itemtype, iteminfo1, iteminfo2, content -> TYPE(INFO1,INFO2)content +in the following + +input +===== + +- UTF8 text + +limitations: currently only german umlauts in addition to ASCII + + +minimal input size (before processing starts) +================== + +processing (ie. tokenization) starts when +- 'PICO_EOF' char received (which happens whenever the cbIn buffer is empty) +- tok-internal buffer is full + + +items output +============ + +processing the character stream can result in one of the +following items: +-> WORDGRAPH(NA,NA)graph <- mapped to lower case; incl. 1-2 digit nrs (0-99) +-> OTHER(NA,NA)string <- skip or spell +-> PUNC(PUNCtype,PUNCsubtype) +-> CMD(CMDtype,CMDsubtype)args + +with +- PUNCtype %d + PICODATA_ITEMINFO1_PUNC_SENTEND + PICODATA_ITEMINFO1_PUNC_PHRASEEND +- PUNCsubtype %d + PICODATA_ITEMINFO2_PUNC_SENT_T + PICODATA_ITEMINFO2_PUNC_SENT_Q + PICODATA_ITEMINFO2_PUNC_SENT_E + PICODATA_ITEMINFO2_PUNC_PHRASE + (used later: PICODATA_ITEMINFO2_PUNC_PHRASE_FORCED) +- CMDtype %d + PICODATA_ITEMINFO1_CMD_FLUSH (no args) + ? PICODATA_ITEMINFO1_CMD_PLAY ? (not yet) +- CMDsubtype %d + PICODATA_ITEMINFO2_NA + ? PICODATA_ITEMINFO2_CMD_PLAY_G2P ? (not yet) +- graph, len>0, utf8 graphemes, %s +- string, len>0, can be any string with printable ascii characters, %s + + +other limitations +================= + +- item size: header plus len=256 (valid for Pico in general) + */ + + +#ifndef PICOTOK_H_ +#define PICOTOK_H_ + +#include "picoos.h" +#include "picodata.h" +#include "picorsrc.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} +#endif + + + +picodata_ProcessingUnit picotok_newTokenizeUnit( + picoos_MemoryManager mm, + picoos_Common common, + picodata_CharBuffer cbIn, + picodata_CharBuffer cbOut, + picorsrc_Voice voice); + +#define PICOTOK_OUTBUF_SIZE 256 + +#ifdef __cplusplus +} +#endif + + +#endif /*PICOTOK_H_*/ |