/*
 * Copyright (C) 2006 Lars Knoll
 * Copyright (C) 2007 Apple Inc. All rights reserved.
 * Copyright (C) 2008 Jürg Billeter
 * Copyright (C) 2008 Dominik Röttsches
 * Copyright (C) 2010 Igalia S.L.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */

#include "config.h"
#include "TextBreakIterator.h"

#include "GOwnPtr.h"
#include <pango/pango.h>

using namespace std;

// True for code points in [0x10000, 0x10FFFF]. characterSize() counts such a
// character as two UTF-16 units and every other character as one.
#define UTF8_IS_SURROGATE(character) ((character) >= 0x10000 && (character) <= 0x10FFFF)

namespace WebCore {

// Walks a UTF-8 copy of a UTF-16 string one character at a time while keeping
// two positions in sync: m_index counts UTF-8 characters and m_utf16Index
// counts UTF-16 units of the string that was passed to setText().
class CharacterIterator {
public:
    CharacterIterator()
        : m_length(0)
        , m_size(0)
        , m_index(0)
        , m_utf16Index(0)
        , m_utf16Length(0)
    {
    }

    bool setText(const UChar* string, int length);
    const gchar* getText() { return m_utf8.get(); }
    int getLength() { return m_length; }
    glong getSize() { return m_size; }
    void setIndex(int index);
    int getIndex() { return m_index; }
    void setUTF16Index(int index);
    int getUTF16Index() { return m_utf16Index; }
    int getUTF16Length() { return m_utf16Length; }
    int first();
    int last();
    int next();
    int previous();

private:
    int characterSize(int index);

    GOwnPtr<char> m_utf8; // UTF-8 conversion of the current text.
    int m_length; // Number of characters in m_utf8.
    long m_size; // Number of bytes in m_utf8.
    int m_index; // Current position, in characters.
    int m_utf16Index; // Current position, in UTF-16 units.
    int m_utf16Length; // Length of the source string, in UTF-16 units.
};

// Returns how many UTF-16 units the character at character position |index|
// occupies: 0 when |index| is negative or equal to the character count,
// otherwise 1 or 2.
int CharacterIterator::characterSize(int index)
{
    if (index == m_length || index < 0)
        return 0;

    // Equal lengths mean every character is a single UTF-16 unit, so the
    // UTF-8 buffer does not need to be inspected.
    if (m_length == m_utf16Length)
        return 1;

    gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index);
    gunichar character = g_utf8_get_char(indexPtr);
    return UTF8_IS_SURROGATE(character) ? 2 : 1;
}

// Converts |string| (|length| UTF-16 units) to UTF-8, stores the result in
// m_utf8 and rewinds both positions to 0. Returns false when the conversion
// yields no bytes; in that case all lengths and positions are reset to 0 so
// they never describe a previously set text.
bool CharacterIterator::setText(const UChar* string, int length)
{
    long utf8Size = 0;
    m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0));

    m_index = 0;
    m_utf16Index = 0;

    if (!utf8Size) {
        m_length = 0;
        m_size = 0;
        m_utf16Length = 0;
        return false;
    }

    m_utf16Length = length;
    m_length = g_utf8_strlen(m_utf8.get(), utf8Size);
    m_size = utf8Size;
    return true;
}

// Moves to character position |index| (clamped to [0, m_length]) and updates
// the UTF-16 position to match.
void CharacterIterator::setIndex(int index)
{
    if (index == m_index)
        return;

    if (index <= 0) {
        m_index = m_utf16Index = 0;
        return;
    }

    if (index >= m_length) {
        m_index = m_length;
        m_utf16Index = m_utf16Length;
        return;
    }

    if (m_length == m_utf16Length) {
        m_index = m_utf16Index = index;
        return;
    }

    // Some characters take two UTF-16 units, so accumulate the unit count of
    // every character that precedes |index|.
    m_index = index;
    int utf16Index = 0;
    for (int utf8Index = 0; utf8Index < index; ++utf8Index)
        utf16Index += characterSize(utf8Index);
    m_utf16Index = utf16Index;
}

// Moves to UTF-16 position |index| (clamped to [0, m_utf16Length]) and updates
// the character position to match.
void CharacterIterator::setUTF16Index(int index)
{
    if (index == m_utf16Index)
        return;

    if (index <= 0) {
        m_utf16Index = m_index = 0;
        return;
    }

    if (index >= m_utf16Length) {
        m_utf16Index = m_utf16Length;
        m_index = m_length;
        return;
    }

    if (m_length == m_utf16Length) {
        m_utf16Index = m_index = index;
        return;
    }

    // Advance character by character until the accumulated UTF-16 unit count
    // reaches |index|.
    m_utf16Index = index;
    int utf16Index = 0;
    int utf8Index = 0;
    while (utf16Index < index) {
        utf16Index += characterSize(utf8Index);
        utf8Index++;
    }
    m_index = utf8Index;
}

// Rewinds to the start of the text and returns the character position (0).
int CharacterIterator::first()
{
    m_index = m_utf16Index = 0;
    return m_index;
}

// Moves just past the final character and returns that character position.
int CharacterIterator::last()
{
    m_index = m_length;
    m_utf16Index = m_utf16Length;
    return m_index;
}

// Advances by one character. Returns the new character position, or sets both
// positions to TextBreakDone and returns it when already at the end.
int CharacterIterator::next()
{
    int next = m_index + 1;
    if (next <= m_length) {
        m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length);
        m_index = next;
    } else {
        m_index = TextBreakDone;
        m_utf16Index = TextBreakDone;
    }
    return m_index;
}

// Steps back by one character. Returns the new character position, or sets
// both positions to TextBreakDone and returns it when already at the start.
int CharacterIterator::previous()
{
    int previous = m_index - 1;
    if (previous >= 0) {
        m_utf16Index = max(m_utf16Index - characterSize(previous), 0);
        m_index = previous;
    } else {
        m_index = TextBreakDone;
        m_utf16Index = TextBreakDone;
    }
    return m_index;
}

enum UBreakIteratorType {
    UBRK_CHARACTER,
    UBRK_WORD,
    UBRK_LINE,
    UBRK_SENTENCE
};

// Break iterator state: the kind of break being searched for, the Pango
// attributes computed for the current text (one entry per character plus one)
// and the position within that text.
class TextBreakIterator {
public:
    TextBreakIterator()
        : m_type(UBRK_CHARACTER)
        , m_logAttrs(0)
    {
    }

    // m_logAttrs is allocated with g_new0() in setUpIterator(); release it
    // here so that deleting an iterator does not leak the array.
    ~TextBreakIterator()
    {
        g_free(m_logAttrs);
    }

    UBreakIteratorType m_type;
    PangoLogAttr* m_logAttrs;
    CharacterIterator m_charIterator;
};

// Reports whether the attributes at character position |index| mark a break of
// the kind |iterator| is looking for.
static bool isBreakAt(const TextBreakIterator* iterator, int index)
{
    const PangoLogAttr& attr = iterator->m_logAttrs[index];
    switch (iterator->m_type) {
    case UBRK_LINE:
        return attr.is_line_break;
    case UBRK_WORD:
        return attr.is_word_start || attr.is_word_end;
    case UBRK_CHARACTER:
        return attr.is_cursor_position;
    case UBRK_SENTENCE:
        return attr.is_sentence_boundary;
    }
    return false;
}

// Points |iterator| at |string|, allocating the iterator first when
// |createdIterator| is false (and setting the flag). Returns the iterator on
// success and 0 when |string| is null, no iterator is available, or the text
// could not be set.
static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, UBreakIteratorType type, const UChar* string, int length)
{
    if (!string)
        return 0;

    if (!createdIterator) {
        iterator = new TextBreakIterator();
        createdIterator = true;
    }
    if (!iterator)
        return 0;

    if (!iterator->m_charIterator.setText(string, length)) {
        // acquireLineBreakIterator() hands the iterator out even when this
        // function fails, so leave it describing an empty text: drop the
        // attributes of the previous string and keep a single zeroed entry so
        // that position 0 stays readable.
        g_free(iterator->m_logAttrs);
        iterator->m_logAttrs = g_new0(PangoLogAttr, 1);
        return 0;
    }

    int charLength = iterator->m_charIterator.getLength();

    iterator->m_type = type;
    g_free(iterator->m_logAttrs);
    iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1);
    pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(), -1, 0, iterator->m_logAttrs, charLength + 1);

    return iterator;
}

// Returns the function-local static character iterator set up for |string|,
// or 0 on failure.
TextBreakIterator* characterBreakIterator(const UChar* string, int length)
{
    static bool createdCharacterBreakIterator = false;
    static TextBreakIterator* staticCharacterBreakIterator;
    return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
}

TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
{
    // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version.
    return characterBreakIterator(string, length);
}

// Returns the function-local static word iterator set up for |string|, or 0
// on failure.
TextBreakIterator* wordBreakIterator(const UChar* string, int length)
{
    static bool createdWordBreakIterator = false;
    static TextBreakIterator* staticWordBreakIterator;
    return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length);
}

static bool createdLineBreakIterator = false;
static TextBreakIterator* staticLineBreakIterator;

// Hands out a line break iterator for |string|. The static iterator is used
// when it has not been created yet or is currently parked in its slot; the
// swap empties the slot while the iterator is handed out. Otherwise a fresh
// iterator is allocated. Pair with releaseLineBreakIterator().
TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
{
    TextBreakIterator* lineBreakIterator = 0;
    if (!createdLineBreakIterator || staticLineBreakIterator) {
        setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
        swap(staticLineBreakIterator, lineBreakIterator);
    }

    if (!lineBreakIterator) {
        bool createdNewLineBreakIterator = false;
        setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
    }

    return lineBreakIterator;
}

// Takes back an iterator handed out by acquireLineBreakIterator(): it is
// parked in the static slot when that slot is empty and deleted otherwise.
void releaseLineBreakIterator(TextBreakIterator* iterator)
{
    ASSERT(createdLineBreakIterator);
    ASSERT(iterator);

    if (!staticLineBreakIterator)
        staticLineBreakIterator = iterator;
    else
        delete iterator;
}

// Returns the function-local static sentence iterator set up for |string|, or
// 0 on failure.
TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
{
    static bool createdSentenceBreakIterator = false;
    static TextBreakIterator* staticSentenceBreakIterator;
    return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
}

// Rewinds |iterator| to the start of its text and returns the UTF-16 position.
int textBreakFirst(TextBreakIterator* iterator)
{
    iterator->m_charIterator.first();
    return iterator->m_charIterator.getUTF16Index();
}

int textBreakLast(TextBreakIterator* iterator)
{
    // TextBreakLast is not meant to find just any break according to bi->m_type
    // but really the one near the last character.
    // (cmp ICU documentation for ubrk_first and ubrk_last)
    // From ICU docs for ubrk_last:
    // "Determine the index immediately beyond the last character in the text being scanned."

    // So we should advance or traverse back based on bi->m_logAttrs cursor positions.
    // If last character position in the original string is a whitespace,
    // traverse to the left until the first non-white character position is found
    // and return the position of the first white-space char after this one.
    // Otherwise return m_length, as "the first character beyond the last" is outside our string.

    bool whiteSpaceAtTheEnd = true;
    int nextWhiteSpacePos = iterator->m_charIterator.getLength();

    int pos = iterator->m_charIterator.last();
    while (pos >= 0 && whiteSpaceAtTheEnd) {
        if (iterator->m_logAttrs[pos].is_cursor_position) {
            // Stop scanning at the first cursor position that is not white
            // space; until then remember the leftmost white one seen.
            whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white;
            if (whiteSpaceAtTheEnd)
                nextWhiteSpacePos = pos;
        }
        pos = iterator->m_charIterator.previous();
    }
    iterator->m_charIterator.setIndex(nextWhiteSpacePos);
    return iterator->m_charIterator.getUTF16Index();
}

// Advances to the next break of the iterator's type and returns its UTF-16
// position, or TextBreakDone when the end of the text is passed first.
int textBreakNext(TextBreakIterator* iterator)
{
    while (iterator->m_charIterator.next() != TextBreakDone) {
        // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol €,
        // are not marked as word_start & word_end as opposed to the way ICU does it.
        // This leads to - for example - different word selection behaviour when right clicking.
        if (isBreakAt(iterator, iterator->m_charIterator.getIndex()))
            break;
    }
    return iterator->m_charIterator.getUTF16Index();
}

// Steps back to the previous break of the iterator's type and returns its
// UTF-16 position, or TextBreakDone when the start of the text is passed first.
int textBreakPrevious(TextBreakIterator* iterator)
{
    while (iterator->m_charIterator.previous() != TextBreakDone) {
        if (isBreakAt(iterator, iterator->m_charIterator.getIndex()))
            break;
    }
    return iterator->m_charIterator.getUTF16Index();
}

// Returns the break before UTF-16 position |offset|: TextBreakDone when
// |offset| is past the end of the text, 0 when it is negative.
int textBreakPreceding(TextBreakIterator* iterator, int offset)
{
    if (offset > iterator->m_charIterator.getUTF16Length())
        return TextBreakDone;
    if (offset < 0)
        return 0;
    iterator->m_charIterator.setUTF16Index(offset);
    return textBreakPrevious(iterator);
}

// Returns the break after UTF-16 position |offset|: TextBreakDone when
// |offset| is past the end of the text, 0 when it is negative.
int textBreakFollowing(TextBreakIterator* iterator, int offset)
{
    if (offset > iterator->m_charIterator.getUTF16Length())
        return TextBreakDone;
    if (offset < 0)
        return 0;
    iterator->m_charIterator.setUTF16Index(offset);
    return textBreakNext(iterator);
}

// Returns the iterator's current UTF-16 position.
int textBreakCurrent(TextBreakIterator* iterator)
{
    return iterator->m_charIterator.getUTF16Index();
}

// Reports whether UTF-16 position |offset| is a break. Offset 0 always is and
// offsets past the end never are; anything else is tested by stepping back one
// character and checking that the next break lands on the same position.
bool isTextBreak(TextBreakIterator* iterator, int offset)
{
    if (!offset)
        return true;
    if (offset > iterator->m_charIterator.getUTF16Length())
        return false;

    iterator->m_charIterator.setUTF16Index(offset);
    int index = iterator->m_charIterator.getIndex();
    iterator->m_charIterator.previous();
    textBreakNext(iterator);
    return iterator->m_charIterator.getIndex() == index;
}

}