summaryrefslogtreecommitdiffstats
path: root/WebKit/chromium/src/WebPageSerializerImpl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'WebKit/chromium/src/WebPageSerializerImpl.cpp')
-rw-r--r--WebKit/chromium/src/WebPageSerializerImpl.cpp547
1 files changed, 547 insertions, 0 deletions
diff --git a/WebKit/chromium/src/WebPageSerializerImpl.cpp b/WebKit/chromium/src/WebPageSerializerImpl.cpp
new file mode 100644
index 0000000..d5b2b7f
--- /dev/null
+++ b/WebKit/chromium/src/WebPageSerializerImpl.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (C) 2009 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// How we handle the base tag better.
+// Current status:
+// At now the normal way we use to handling base tag is
+// a) For those links which have corresponding local saved files, such as
+// savable CSS, JavaScript files, they will be written to relative URLs which
+// point to local saved file. Why those links can not be resolved as absolute
+// file URLs, because if they are resolved as absolute URLs, after moving the
+// file location from one directory to another directory, the file URLs will
+// be dead links.
+// b) For those links which have not corresponding local saved files, such as
+// links in A, AREA tags, they will be resolved as absolute URLs.
+// c) We comment all base tags when serialzing DOM for the page.
+// FireFox also uses above way to handle base tag.
+//
+// Problem:
+// This way can not handle the following situation:
+// the base tag is written by JavaScript.
+// For example. The page "www.yahoo.com" use
+// "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
+// of page when loading page. So when saving page as completed-HTML, we assume
+// that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
+// completed-HTML page, then the JavaScript will insert a base tag
+// <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
+// local saved resource files will be resolved as
+// "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource
+// files can not be loaded correctly. Also the page will be rendered ugly since
+// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
+// files can not be fetched.
+// Now FireFox, IE and WebKit based Browser all have this problem.
+//
+// Solution:
+// My solution is that we comment old base tag and write new base tag:
+// <base href="." ...> after the previous commented base tag. In WebKit, it
+// always uses the latest "href" attribute of base tag to set document's base
+// URL. Based on this behavior, when we encounter a base tag, we comment it and
+// write a new base tag <base href="."> after the previous commented base tag.
+// The new added base tag can help engine to locate correct base URL for
+// correctly loading local saved resource files. Also I think we need to inherit
+// the base target value from document object when appending new base tag.
+// If there are multiple base tags in original document, we will comment all old
+// base tags and append new base tag after each old base tag because we do not
+// know those old base tags are original content or added by JavaScript. If
+// they are added by JavaScript, it means when loading saved page, the script(s)
+// will still insert base tag(s) to DOM, so the new added base tag(s) can
+// override the incorrect base URL and make sure we alway load correct local
+// saved resource files.
+
+#include "config.h"
+#include "WebPageSerializerImpl.h"
+
+#include "Document.h"
+#include "DocumentType.h"
+#include "Element.h"
+#include "FrameLoader.h"
+#include "HTMLAllCollection.h"
+#include "HTMLElement.h"
+#include "HTMLFormElement.h"
+#include "HTMLMetaElement.h"
+#include "HTMLNames.h"
+#include "KURL.h"
+#include "PlatformString.h"
+#include "StringBuilder.h"
+#include "TextEncoding.h"
+#include "markup.h"
+
+#include "DOMUtilitiesPrivate.h"
+#include "WebFrameImpl.h"
+#include "WebURL.h"
+#include "WebVector.h"
+
+using namespace WebCore;
+
+namespace WebKit {
+
+// Maximum length of data buffer which is used to temporary save generated
+// html content data. This is a soft limit which might be passed if a very large
+// contegious string is found in the page.
+static const unsigned dataBufferCapacity = 65536;
+
+WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& currentFrameURL,
+ const TextEncoding& textEncoding,
+ Document* doc,
+ const String& directoryName)
+ : currentFrameURL(currentFrameURL)
+ , textEncoding(textEncoding)
+ , doc(doc)
+ , directoryName(directoryName)
+ , hasDoctype(false)
+ , hasCheckedMeta(false)
+ , skipMetaElement(0)
+ , isInScriptOrStyleTag(false)
+ , hasDocDeclaration(false)
+{
+ // Cache the value since we check it lots of times.
+ isHTMLDocument = doc->isHTMLDocument();
+}
+
+String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
+ const Element* element, SerializeDomParam* param, bool* needSkip)
+{
+ StringBuilder result;
+
+ *needSkip = false;
+ if (param->isHTMLDocument) {
+ // Skip the open tag of original META tag which declare charset since we
+ // have overrided the META which have correct charset declaration after
+ // serializing open tag of HEAD element.
+ if (element->hasTagName(HTMLNames::metaTag)) {
+ const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element);
+ // Check whether the META tag has declared charset or not.
+ String equiv = meta->httpEquiv();
+ if (equalIgnoringCase(equiv, "content-type")) {
+ String content = meta->content();
+ if (content.length() && content.contains("charset", false)) {
+ // Find META tag declared charset, we need to skip it when
+ // serializing DOM.
+ param->skipMetaElement = element;
+ *needSkip = true;
+ }
+ }
+ } else if (element->hasTagName(HTMLNames::htmlTag)) {
+ // Check something before processing the open tag of HEAD element.
+ // First we add doc type declaration if original doc has it.
+ if (!param->hasDoctype) {
+ param->hasDoctype = true;
+ result.append(createMarkup(param->doc->doctype()));
+ }
+
+ // Add MOTW declaration before html tag.
+ // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
+ result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->currentFrameURL));
+ } else if (element->hasTagName(HTMLNames::baseTag)) {
+ // Comment the BASE tag when serializing dom.
+ result.append("<!--");
+ }
+ } else {
+ // Write XML declaration.
+ if (!param->hasDocDeclaration) {
+ param->hasDocDeclaration = true;
+ // Get encoding info.
+ String xmlEncoding = param->doc->xmlEncoding();
+ if (xmlEncoding.isEmpty())
+ xmlEncoding = param->doc->frame()->loader()->encoding();
+ if (xmlEncoding.isEmpty())
+ xmlEncoding = UTF8Encoding().name();
+ result.append("<?xml version=\"");
+ result.append(param->doc->xmlVersion());
+ result.append("\" encoding=\"");
+ result.append(xmlEncoding);
+ if (param->doc->xmlStandalone())
+ result.append("\" standalone=\"yes");
+ result.append("\"?>\n");
+ }
+ // Add doc type declaration if original doc has it.
+ if (!param->hasDoctype) {
+ param->hasDoctype = true;
+ result.append(createMarkup(param->doc->doctype()));
+ }
+ }
+ return result.toString();
+}
+
+String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
+ const Element* element, SerializeDomParam* param)
+{
+ StringBuilder result;
+
+ param->hasAddedContentsBeforeEnd = false;
+ if (!param->isHTMLDocument)
+ return result.toString();
+ // Check after processing the open tag of HEAD element
+ if (!param->hasCheckedMeta
+ && element->hasTagName(HTMLNames::headTag)) {
+ param->hasCheckedMeta = true;
+ // Check meta element. WebKit only pre-parse the first 512 bytes
+ // of the document. If the whole <HEAD> is larger and meta is the
+ // end of head part, then this kind of pages aren't decoded correctly
+ // because of this issue. So when we serialize the DOM, we need to
+ // make sure the meta will in first child of head tag.
+ // See http://bugs.webkit.org/show_bug.cgi?id=16621.
+ // First we generate new content for writing correct META element.
+ result.append(WebPageSerializer::generateMetaCharsetDeclaration(
+ String(param->textEncoding.name())));
+
+ param->hasAddedContentsBeforeEnd = true;
+ // Will search each META which has charset declaration, and skip them all
+ // in PreActionBeforeSerializeOpenTag.
+ } else if (element->hasTagName(HTMLNames::scriptTag)
+ || element->hasTagName(HTMLNames::styleTag)) {
+ param->isInScriptOrStyleTag = true;
+ }
+
+ return result.toString();
+}
+
+String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
+ const Element* element, SerializeDomParam* param, bool* needSkip)
+{
+ String result;
+
+ *needSkip = false;
+ if (!param->isHTMLDocument)
+ return result;
+ // Skip the end tag of original META tag which declare charset.
+ // Need not to check whether it's META tag since we guarantee
+ // skipMetaElement is definitely META tag if it's not 0.
+ if (param->skipMetaElement == element)
+ *needSkip = true;
+ else if (element->hasTagName(HTMLNames::scriptTag)
+ || element->hasTagName(HTMLNames::styleTag)) {
+ ASSERT(param->isInScriptOrStyleTag);
+ param->isInScriptOrStyleTag = false;
+ }
+
+ return result;
+}
+
+// After we finish serializing end tag of a element, we give the target
+// element a chance to do some post work to add some additional data.
+String WebPageSerializerImpl::postActionAfterSerializeEndTag(
+ const Element* element, SerializeDomParam* param)
+{
+ StringBuilder result;
+
+ if (!param->isHTMLDocument)
+ return result.toString();
+ // Comment the BASE tag when serializing DOM.
+ if (element->hasTagName(HTMLNames::baseTag)) {
+ result.append("-->");
+ // Append a new base tag declaration.
+ result.append(WebPageSerializer::generateBaseTagDeclaration(
+ param->doc->baseTarget()));
+ }
+
+ return result.toString();
+}
+
+void WebPageSerializerImpl::saveHTMLContentToBuffer(
+ const String& result, SerializeDomParam* param)
+{
+ m_dataBuffer.append(result);
+ encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
+ param,
+ 0);
+}
+
+void WebPageSerializerImpl::encodeAndFlushBuffer(
+ WebPageSerializerClient::PageSerializationStatus status,
+ SerializeDomParam* param,
+ bool force)
+{
+ // Data buffer is not full nor do we want to force flush.
+ if (!force && m_dataBuffer.length() <= dataBufferCapacity)
+ return;
+
+ String content = m_dataBuffer.toString();
+ m_dataBuffer.clear();
+
+ // Convert the unicode content to target encoding
+ CString encodedContent = param->textEncoding.encode(
+ content.characters(), content.length(), EntitiesForUnencodables);
+
+ // Send result to the client.
+ m_client->didSerializeDataForFrame(param->currentFrameURL,
+ WebCString(encodedContent.data(), encodedContent.length()),
+ status);
+}
+
+void WebPageSerializerImpl::openTagToString(const Element* element,
+ SerializeDomParam* param)
+{
+ // FIXME: use StringBuilder instead of String.
+ bool needSkip;
+ // Do pre action for open tag.
+ String result = preActionBeforeSerializeOpenTag(element, param, &needSkip);
+ if (needSkip)
+ return;
+ // Add open tag
+ result += "<" + element->nodeName();
+ // Go through all attributes and serialize them.
+ const NamedNodeMap *attrMap = element->attributes(true);
+ if (attrMap) {
+ unsigned numAttrs = attrMap->length();
+ for (unsigned i = 0; i < numAttrs; i++) {
+ result += " ";
+ // Add attribute pair
+ const Attribute *attribute = attrMap->attributeItem(i);
+ result += attribute->name().toString();
+ result += "=\"";
+ if (!attribute->value().isEmpty()) {
+ const String& attrValue = attribute->value();
+
+ // Check whether we need to replace some resource links
+ // with local resource paths.
+ const QualifiedName& attrName = attribute->name();
+ if (elementHasLegalLinkAttribute(element, attrName)) {
+ // For links start with "javascript:", we do not change it.
+ if (attrValue.startsWith("javascript:", false))
+ result += attrValue;
+ else {
+ // Get the absolute link
+ String completeURL = param->doc->completeURL(attrValue);
+ // Check whether we have local files for those link.
+ if (m_localLinks.contains(completeURL)) {
+ if (!m_localDirectoryName.isEmpty())
+ result += "./" + m_localDirectoryName + "/";
+ result += m_localLinks.get(completeURL);
+ } else
+ result += completeURL;
+ }
+ } else {
+ if (param->isHTMLDocument)
+ result += m_htmlEntities.convertEntitiesInString(attrValue);
+ else
+ result += m_xmlEntities.convertEntitiesInString(attrValue);
+ }
+ }
+ result += "\"";
+ }
+ }
+
+ // Do post action for open tag.
+ String addedContents = postActionAfterSerializeOpenTag(element, param);
+ // Complete the open tag for element when it has child/children.
+ if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd)
+ result += ">";
+ // Append the added contents generate in post action of open tag.
+ result += addedContents;
+ // Save the result to data buffer.
+ saveHTMLContentToBuffer(result, param);
+}
+
+// Serialize end tag of an specified element.
+void WebPageSerializerImpl::endTagToString(const Element* element,
+ SerializeDomParam* param)
+{
+ bool needSkip;
+ // Do pre action for end tag.
+ String result = preActionBeforeSerializeEndTag(element,
+ param,
+ &needSkip);
+ if (needSkip)
+ return;
+ // Write end tag when element has child/children.
+ if (element->hasChildNodes() || param->hasAddedContentsBeforeEnd) {
+ result += "</";
+ result += element->nodeName();
+ result += ">";
+ } else {
+ // Check whether we have to write end tag for empty element.
+ if (param->isHTMLDocument) {
+ result += ">";
+ const HTMLElement* htmlElement =
+ static_cast<const HTMLElement*>(element);
+ if (htmlElement->endTagRequirement() == TagStatusRequired) {
+ // We need to write end tag when it is required.
+ result += "</";
+ result += element->nodeName();
+ result += ">";
+ }
+ } else {
+ // For xml base document.
+ result += " />";
+ }
+ }
+ // Do post action for end tag.
+ result += postActionAfterSerializeEndTag(element, param);
+ // Save the result to data buffer.
+ saveHTMLContentToBuffer(result, param);
+}
+
+void WebPageSerializerImpl::buildContentForNode(const Node* node,
+ SerializeDomParam* param)
+{
+ switch (node->nodeType()) {
+ case Node::ELEMENT_NODE:
+ // Process open tag of element.
+ openTagToString(static_cast<const Element*>(node), param);
+ // Walk through the children nodes and process it.
+ for (const Node *child = node->firstChild(); child; child = child->nextSibling())
+ buildContentForNode(child, param);
+ // Process end tag of element.
+ endTagToString(static_cast<const Element*>(node), param);
+ break;
+ case Node::TEXT_NODE:
+ saveHTMLContentToBuffer(createMarkup(node), param);
+ break;
+ case Node::ATTRIBUTE_NODE:
+ case Node::DOCUMENT_NODE:
+ case Node::DOCUMENT_FRAGMENT_NODE:
+ // Should not exist.
+ ASSERT_NOT_REACHED();
+ break;
+ // Document type node can be in DOM?
+ case Node::DOCUMENT_TYPE_NODE:
+ param->hasDoctype = true;
+ default:
+ // For other type node, call default action.
+ saveHTMLContentToBuffer(createMarkup(node), param);
+ break;
+ }
+}
+
+WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
+ bool recursiveSerialization,
+ WebPageSerializerClient* client,
+ const WebVector<WebURL>& links,
+ const WebVector<WebString>& localPaths,
+ const WebString& localDirectoryName)
+ : m_client(client)
+ , m_recursiveSerialization(recursiveSerialization)
+ , m_framesCollected(false)
+ , m_localDirectoryName(localDirectoryName)
+ , m_htmlEntities(false)
+ , m_xmlEntities(true)
+{
+ // Must specify available webframe.
+ ASSERT(frame);
+ m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame);
+ // Make sure we have non 0 client.
+ ASSERT(client);
+ // Build local resources map.
+ ASSERT(links.size() == localPaths.size());
+ for (size_t i = 0; i < links.size(); i++) {
+ KURL url = links[i];
+ ASSERT(!m_localLinks.contains(url.string()));
+ m_localLinks.set(url.string(), localPaths[i]);
+ }
+
+ ASSERT(!m_dataBuffer.length());
+}
+
+void WebPageSerializerImpl::collectTargetFrames()
+{
+ ASSERT(!m_framesCollected);
+ m_framesCollected = true;
+
+ // First, process main frame.
+ m_frames.append(m_specifiedWebFrameImpl);
+ // Return now if user only needs to serialize specified frame, not including
+ // all sub-frames.
+ if (!m_recursiveSerialization)
+ return;
+ // Collect all frames inside the specified frame.
+ for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
+ WebFrameImpl* currentFrame = m_frames[i];
+ // Get current using document.
+ Document* currentDoc = currentFrame->frame()->document();
+ // Go through sub-frames.
+ RefPtr<HTMLAllCollection> all = currentDoc->all();
+ for (Node* node = all->firstItem(); node; node = all->nextItem()) {
+ if (!node->isHTMLElement())
+ continue;
+ Element* element = static_cast<Element*>(node);
+ WebFrameImpl* webFrame =
+ WebFrameImpl::fromFrameOwnerElement(element);
+ if (webFrame)
+ m_frames.append(webFrame);
+ }
+ }
+}
+
+bool WebPageSerializerImpl::serialize()
+{
+ // Collect target frames.
+ if (!m_framesCollected)
+ collectTargetFrames();
+ bool didSerialization = false;
+ // Get KURL for main frame.
+ KURL mainPageURL = m_specifiedWebFrameImpl->frame()->loader()->url();
+
+ // Go through all frames for serializing DOM for whole page, include
+ // sub-frames.
+ for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
+ // Get current serializing frame.
+ WebFrameImpl* currentFrame = m_frames[i];
+ // Get current using document.
+ Document* currentDoc = currentFrame->frame()->document();
+ // Get current frame's URL.
+ const KURL& currentFrameURL = currentFrame->frame()->loader()->url();
+
+ // Check whether we have done this document.
+ if (m_localLinks.contains(currentFrameURL.string())) {
+ // A new document, we will serialize it.
+ didSerialization = true;
+ // Get target encoding for current document.
+ String encoding = currentFrame->frame()->loader()->encoding();
+ // Create the text encoding object with target encoding.
+ TextEncoding textEncoding(encoding);
+ // Construct serialize parameter for late processing document.
+ SerializeDomParam param(currentFrameURL,
+ encoding.length() ? textEncoding : UTF8Encoding(),
+ currentDoc,
+ currentFrameURL == mainPageURL ? m_localDirectoryName : "");
+
+ // Process current document.
+ Element* rootElement = currentDoc->documentElement();
+ if (rootElement)
+ buildContentForNode(rootElement, &param);
+
+ // Flush the remainder data and finish serializing current frame.
+ encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished,
+ &param,
+ 1);
+ }
+ }
+
+ // We have done call frames, so we send message to embedder to tell it that
+ // frames are finished serializing.
+ ASSERT(!m_dataBuffer.length());
+ m_client->didSerializeDataForFrame(KURL(),
+ WebCString("", 0),
+ WebPageSerializerClient::AllFramesAreFinished);
+ return didSerialization;
+}
+
+} // namespace WebKit