/*
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * @file picokfst.c
 *
 * FST knowledge loading and access
 *
 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
 * All rights reserved.
 *
 * History:
 * - 2009-04-20 -- initial version
 *
 */

#include "picoos.h"
#include "picodbg.h"
#include "picoknow.h"
#include "picokfst.h"

#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif


#define FileHdrSize 4       /* size of FST file header */



/* ************************************************************/
/* function to create specialized kb, */
/* to be used by picorsrc only */
/* ************************************************************/

/** object       : FSTKnowledgeBase
 *  shortcut     : kfst
 *  derived from : picoknow_KnowledgeBase
 */

typedef struct kfst_subobj * kfst_SubObj;

typedef struct kfst_subobj {
    picoos_uint8 * fstStream;         /* the byte stream base address */
    picoos_int32 hdrLen;              /* length of file header */
    picoos_int32 transductionMode;    /* transduction mode to be used for FST */
    picoos_int32 nrClasses;           /* nr of pair/transition classes in FST; class is in [1..nrClasses] */
    picoos_int32 nrStates;            /* nr of states in FST; state is in [1..nrStates] */
    picoos_int32 termClass;           /* pair class of terminator symbol pair; probably obsolete */
    picoos_int32 alphaHashTabSize;    /* size of pair alphabet hash table */
    picoos_int32 alphaHashTabPos;     /* absolute address of the start of the pair alphabet */
    picoos_int32 transTabEntrySize;   /* size in bytes of each transition table entry */
    picoos_int32 transTabPos;         /* absolute address of the start of the transition table */
    picoos_int32 inEpsStateTabPos;    /* absolute address of the start of the input epsilon transition table */
    picoos_int32 accStateTabPos;      /* absolute address of the table of accepting states */
} kfst_subobj_t;



/* ************************************************************/
/* primitives for reading from byte stream */
/* ************************************************************/

/* NOTE: none of the readers below knows the length of 'stream'; they read
   as many bytes as the encoding asks for. Callers must pass positions that
   lie inside a well-formed FST stream. */

/* Maps the unsigned stream representation 'val' of a signed number to its
   value. The least significant bit carries the sign: an odd 'val' encodes
   the negative number -((val - 1) / 2) - 1, an even 'val' encodes the
   non-negative number val / 2. */
static picoos_int32 UnsignedToSignedNum (picoos_uint32 val)
{
    if (val % 2 == 1) {
        /* negative number */
        return -((picoos_int32)((val - 1) / 2)) - 1;
    }
    /* positive number */
    return (picoos_int32)(val / 2);
}


/* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into unsigned number 'num'.
   The bytes are combined most significant byte first.
   '*pos' is modified to the position right after the number */
static void FixedBytesToUnsignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_uint32 * num)
{
    picoos_int32 i;

    (*num) = 0;
    for (i = 0; i < nrBytes; i++) {
        (*num) = ((*num) << 8) + (picoos_uint32)stream[*pos];
        (*pos)++;
    }
}


/* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into signed number 'num'.
   The bytes are combined most significant byte first; the resulting
   unsigned value is then decoded by UnsignedToSignedNum.
   '*pos' is modified to the position right after the number */
static void FixedBytesToSignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_int32 * num)
{
    picoos_int32 i;
    picoos_uint32 val;

    val = 0;
    for (i = 0; i < nrBytes; i++) {
        val = (val << 8) + (picoos_uint32)stream[*pos];
        (*pos)++;
    }
    (*num) = UnsignedToSignedNum(val);
}


/* Converts varying-sized sequence of bytes starting at position '*pos' in byte stream 'stream'
   into (signed) number 'num'. '*pos' is modified to the position right after the number.
   Every byte contributes 7 bits, most significant group first. Bytes below
   128 are followed by further bytes; the first byte >= 128 supplies its low
   7 bits and ends the number. The accumulated unsigned value is then decoded
   by UnsignedToSignedNum. */
static void BytesToNum (picoos_uint8 * stream, picoos_uint32 * pos, picoos_int32 * num)
{
    picoos_uint32 val;
    picoos_uint32 b;

    val = 0;
    b = (picoos_uint32)stream[*pos];
    (*pos)++;
    while (b < 128) {
        val = (val << 7) + b;
        b = (picoos_uint32)stream[*pos];
        (*pos)++;
    }
    val = (val << 7) + (b - 128);
    (*num) = UnsignedToSignedNum(val);
}



/* ************************************************************/
/* setting up FST from byte stream */
/* ************************************************************/

/* Fills the kfst sub-object of 'this' from the byte stream at 'this->base'.
   Starting right after the FileHdrSize-byte file header, the following
   variable-length numbers are read in this order: transduction mode, nr of
   classes, nr of states, terminator class, alphabet hash table size,
   alphabet hash table offset, transition table entry size, transition table
   offset, input epsilon table offset, accepting state table offset.
   The four offsets are stored relative to the end of the file header and
   are converted to absolute stream positions here.
   Returns PICO_OK, or the result of raising PICO_EXC_KB_MISSING if 'this'
   or its sub-object is NULL. */
static pico_status_t kfstInitialize(register picoknow_KnowledgeBase this,
                                    picoos_Common common)
{
    picoos_uint32 curpos;
    picoos_int32 offs;
    kfst_subobj_t * kfst;

    PICODBG_DEBUG(("kfstInitialize -- start\n"));

    if (NULL == this || NULL == this->subObj) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL,
                                       NULL);
    }
    kfst = (kfst_subobj_t *) this->subObj;

    /* +CT+ */
    kfst->fstStream = this->base;
    /* NOTE(review): a pointer is passed for "%d" here; left unchanged,
       confirm what the debug formatter expects */
    PICODBG_TRACE(("base: %d\n",this->base));
    kfst->hdrLen = FileHdrSize;
    curpos = kfst->hdrLen;
    BytesToNum(kfst->fstStream,& curpos,& kfst->transductionMode);
    BytesToNum(kfst->fstStream,& curpos,& kfst->nrClasses);
    BytesToNum(kfst->fstStream,& curpos,& kfst->nrStates);
    BytesToNum(kfst->fstStream,& curpos,& kfst->termClass);
    BytesToNum(kfst->fstStream,& curpos,& kfst->alphaHashTabSize);
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->alphaHashTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream,& curpos,& kfst->transTabEntrySize);
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->transTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->inEpsStateTabPos = kfst->hdrLen + offs;
    BytesToNum(kfst->fstStream,& curpos,& offs);
    kfst->accStateTabPos = kfst->hdrLen + offs;
    /* -CT- */

    return PICO_OK;
}


/* Releases the kfst sub-object of 'this' through memory manager 'mm'.
   A NULL 'this' is ignored. Always returns PICO_OK. */
static pico_status_t kfstSubObjDeallocate(register picoknow_KnowledgeBase this,
                                          picoos_MemoryManager mm)
{
    if (NULL != this) {
        picoos_deallocate(mm, (void *) &this->subObj);
    }
    return PICO_OK;
}


/* calculates a small number of data (e.g. addresses) from kb for fast access.
 * This data is encapsulated in a picokfst_FST that can later be retrieved
 * with picokfst_getFST.
 *
 * A kb whose size is 0 is treated as a dummy kb: nothing is allocated and
 * PICO_OK is returned.
 * Returns PICO_OK on success, the result of raising PICO_EXC_KB_MISSING if
 * 'this' is NULL, the result of raising PICO_EXC_OUT_OF_MEM if the
 * sub-object cannot be allocated, or the status reported by kfstInitialize
 * if initialization fails (the sub-object is released again in that case).
 */
pico_status_t picokfst_specializeFSTKnowledgeBase(picoknow_KnowledgeBase this,
                                                  picoos_Common common)
{
    pico_status_t status = PICO_OK;

    if (NULL == this) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL,
                                       NULL);
    }
    if (0 < this->size) {
        /* not a dummy kb */
        this->subDeallocate = kfstSubObjDeallocate;

        this->subObj = picoos_allocate(common->mm, sizeof(kfst_subobj_t));

        if (NULL == this->subObj) {
            return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
                                           NULL, NULL);
        }
        status = kfstInitialize(this, common);
        if (PICO_OK != status) {
            picoos_deallocate(common->mm,(void **)&this->subObj);
        }
    }
    /* report an initialization failure instead of masking it with PICO_OK */
    return status;
}



/* ************************************************************/
/* FST type and getFST function */
/* ************************************************************/

/* return kb FST for usage in PU; NULL if 'this' is NULL */
picokfst_FST picokfst_getFST(picoknow_KnowledgeBase this)
{
    if (NULL == this) {
        return NULL;
    } else {
        return (picokfst_FST) this->subObj;
    }
}



/* ************************************************************/
/* FST access methods */
/* ************************************************************/

/* All access methods below accept a NULL 'this' and then behave like an
   FST without any states, symbols or transitions. */

/* see description in header file */
/* returns the transduction mode read from the FST header; 0 for a NULL FST */
extern picoos_uint8 picokfst_kfstGetTransductionMode(picokfst_FST this)
{
    kfst_SubObj fst = (kfst_SubObj) this;

    if (fst != NULL) {
        return fst->transductionMode;
    } else {
        return 0;
    }
}


/* see description in header file */
/* stores the nr of states and the nr of pair classes of the FST in
   '*nrStates' and '*nrClasses'; both are 0 for a NULL FST */
extern void picokfst_kfstGetFSTSizes (picokfst_FST this, picoos_int32 *nrStates, picoos_int32 *nrClasses)
{
    kfst_SubObj fst = (kfst_SubObj) this;

    if (fst != NULL) {
        *nrStates = fst->nrStates;
        *nrClasses = fst->nrClasses;
    } else {
        *nrStates = 0;
        *nrClasses = 0;
    }
}


/* see description in header file */
/* Looks up 'inSym' in the pair alphabet hash table.
   The table consists of 4-byte signed entries indexed by
   inSym % alphaHashTabSize; an entry > 0 is the offset (relative to the
   table start) of the first symbol cell of that bucket. A cell starts with
   the input symbol and the offset to the next cell of the same bucket
   (relative to the current cell, <= 0 if there is none), both stored as
   variable-length numbers.
   On success '*inSymFound' is set to 1 and '*searchState' to the stream
   position right after these two numbers, for use with
   picokfst_kfstGetNextPair. Otherwise '*inSymFound' is 0 and
   '*searchState' is -1. */
extern void picokfst_kfstStartPairSearch (picokfst_FST this, picokfst_symid_t inSym,
                                          picoos_bool * inSymFound, picoos_int32 * searchState)
{
    picoos_uint32 pos;
    picoos_int32 offs;
    picoos_int32 h;
    picoos_int32 inSymCellPos;
    picoos_int32 inSymX;
    picoos_int32 nextSameHashInSymOffs;
    kfst_SubObj fst = (kfst_SubObj) this;

    (*searchState) = -1;
    (*inSymFound) = 0;

    /* no FST or no hash table: nothing can be found; this also keeps the
       modulo below from dividing by zero */
    if ((NULL == fst) || (fst->alphaHashTabSize <= 0)) {
        return;
    }
    h = inSym % fst->alphaHashTabSize;
    if (h < 0) {
        /* a negative symbol id would index in front of the hash table */
        return;
    }
    pos = fst->alphaHashTabPos + (h * 4);
    FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
    if (offs <= 0) {
        /* empty bucket */
        return;
    }

    /* walk the chain of cells sharing this hash value */
    inSymCellPos = fst->alphaHashTabPos + offs;
    for (;;) {
        pos = inSymCellPos;
        BytesToNum(fst->fstStream,& pos,& inSymX);
        BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs);
        if ((inSymX == inSym) || (nextSameHashInSymOffs <= 0)) {
            break;
        }
        inSymCellPos = inSymCellPos + nextSameHashInSymOffs;
    }
    if (inSymX == inSym) {
        /* input symbol found; state is set to position after symbol cell */
        (*searchState) = pos;
        (*inSymFound) = 1;
    }
}


/* see description in header file */
/* Reads the next (output symbol, pair class) entry at stream position
   '*searchState'. If the output symbol read is not PICOKFST_SYMID_ILLEG,
   '*pairFound' is set to 1, '*outSym' and '*pairClass' receive the entry
   and '*searchState' is advanced behind it. Otherwise (negative
   '*searchState', terminating PICOKFST_SYMID_ILLEG, or NULL FST)
   '*pairFound' is 0, '*outSym' is PICOKFST_SYMID_ILLEG and '*pairClass'
   is -1. */
extern void picokfst_kfstGetNextPair (picokfst_FST this, picoos_int32 * searchState,
                                      picoos_bool * pairFound,
                                      picokfst_symid_t * outSym, picokfst_class_t * pairClass)
{
    picoos_uint32 pos;
    picoos_int32 val;
    kfst_SubObj fst = (kfst_SubObj) this;

    if ((NULL == fst) || ((*searchState) < 0)) {
        (*pairFound) = 0;
        (*outSym) = PICOKFST_SYMID_ILLEG;
        (*pairClass) = -1;
        if ((*searchState) >= 0) {
            /* only reachable with a NULL FST: end the search */
            (*searchState) = -1;
        }
    } else {
        pos = (*searchState);
        BytesToNum(fst->fstStream,& pos,& val);
        *outSym = (picokfst_symid_t)val;
        if ((*outSym) != PICOKFST_SYMID_ILLEG) {
            BytesToNum(fst->fstStream,& pos,& val);
            *pairClass = (picokfst_class_t)val;
            (*pairFound) = 1;
            (*searchState) = pos;
        } else {
            /* end of pair list reached */
            (*pairFound) = 0;
            (*outSym) = PICOKFST_SYMID_ILLEG;
            (*pairClass) = -1;
            (*searchState) = -1;
        }
    }
}


/* see description in header file */
/* Looks up the transition table entry for ('startState', 'transClass').
   The table holds nrClasses entries of transTabEntrySize bytes per state,
   states and classes both counted from 1. '*endState' receives the entry,
   or 0 if 'startState' or 'transClass' is out of range or the FST is
   NULL. */
extern void picokfst_kfstGetTrans (picokfst_FST this, picokfst_state_t startState, picokfst_class_t transClass,
                                   picokfst_state_t * endState)
{
    picoos_uint32 pos;
    picoos_int32 index;
    picoos_uint32 endStateX;
    kfst_SubObj fst = (kfst_SubObj) this;

    if ((NULL == fst)
        || (startState < 1) || (startState > fst->nrStates)
        || (transClass < 1) || (transClass > fst->nrClasses)) {
        (*endState) = 0;
    } else {
        index = (startState - 1) * fst->nrClasses + transClass - 1;
        pos = fst->transTabPos + (index * fst->transTabEntrySize);
        FixedBytesToUnsignedNum(fst->fstStream,fst->transTabEntrySize,& pos,& endStateX);
        (*endState) = endStateX;
    }
}


/* see description in header file */
/* Prepares the enumeration of the input epsilon transitions leaving
   'startState'. The input epsilon table starts with one 4-byte signed
   entry per state; an entry > 0 is the offset (relative to the table
   start) of that state's transition list. If such a list exists,
   '*inEpsTransFound' is set to 1 and '*searchState' to the list position,
   for use with picokfst_kfstGetNextInEpsTrans. Otherwise
   '*inEpsTransFound' is 0 and '*searchState' is -1. */
extern void picokfst_kfstStartInEpsTransSearch (picokfst_FST this, picokfst_state_t startState,
                                                picoos_bool * inEpsTransFound, picoos_int32 * searchState)
{
    picoos_int32 offs;
    picoos_uint32 pos;
    kfst_SubObj fst = (kfst_SubObj) this;

    (*searchState) = -1;
    (*inEpsTransFound) = 0;
    if ((NULL != fst) && (startState > 0) && (startState <= fst->nrStates)) {
        pos = fst->inEpsStateTabPos + (startState - 1) * 4;
        FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
        if (offs > 0) {
            (*searchState) = fst->inEpsStateTabPos + offs;
            (*inEpsTransFound) = 1;
        }
    }
}


/* see description in header file */
/* Reads the next (output symbol, end state) entry at stream position
   '*searchState'. If the output symbol read is not PICOKFST_SYMID_ILLEG,
   '*inEpsTransFound' is set to 1, '*outSym' and '*endState' receive the
   entry and '*searchState' is advanced behind it. Otherwise (negative
   '*searchState', terminating PICOKFST_SYMID_ILLEG, or NULL FST)
   '*inEpsTransFound' is 0, '*outSym' is PICOKFST_SYMID_ILLEG and
   '*endState' is 0. */
extern void picokfst_kfstGetNextInEpsTrans (picokfst_FST this, picoos_int32 * searchState,
                                            picoos_bool * inEpsTransFound,
                                            picokfst_symid_t * outSym, picokfst_state_t * endState)
{
    picoos_uint32 pos;
    picoos_int32 val;
    kfst_SubObj fst = (kfst_SubObj) this;

    if ((NULL == fst) || ((*searchState) < 0)) {
        (*inEpsTransFound) = 0;
        (*outSym) = PICOKFST_SYMID_ILLEG;
        (*endState) = 0;
        if ((*searchState) >= 0) {
            /* only reachable with a NULL FST: end the search */
            (*searchState) = -1;
        }
    } else {
        pos = (*searchState);
        BytesToNum(fst->fstStream,& pos,& val);
        *outSym = (picokfst_symid_t)val;
        if ((*outSym) != PICOKFST_SYMID_ILLEG) {
            BytesToNum(fst->fstStream,& pos,& val);
            *endState = (picokfst_state_t)val;
            (*inEpsTransFound) = 1;
            (*searchState) = pos;
        } else {
            /* end of transition list reached */
            (*inEpsTransFound) = 0;
            (*outSym) = PICOKFST_SYMID_ILLEG;
            (*endState) = 0;
            (*searchState) = -1;
        }
    }
}


/* see description in header file */
/* The accepting state table holds one byte per state (states counted from
   1); a state is accepting iff its byte equals 1. Returns 0 if 'state' is
   out of range or the FST is NULL. */
extern picoos_bool picokfst_kfstIsAcceptingState (picokfst_FST this, picokfst_state_t state)
{
    picoos_uint32 pos;
    picoos_uint32 val;
    kfst_SubObj fst = (kfst_SubObj) this;

    if ((NULL != fst) && (state > 0) && (state <= fst->nrStates)) {
        pos = fst->accStateTabPos + (state - 1);
        FixedBytesToUnsignedNum(fst->fstStream,1,& pos,& val);
        return (val == 1);
    } else {
        return 0;
    }
}

#ifdef __cplusplus
}
#endif

/* End picokfst.c */