/* * Copyright (C) 2008, 2009 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "LegacyWebArchive.h" #include "Cache.h" #include "Document.h" #include "DocumentLoader.h" #include "Frame.h" #include "FrameLoader.h" #include "FrameTree.h" #include "HTMLFrameOwnerElement.h" #include "HTMLNames.h" #include "IconDatabase.h" #include "Image.h" #include "KURLHash.h" #include "Logging.h" #include "markup.h" #include "Node.h" #include "Range.h" #include "SelectionController.h" #include "SharedBuffer.h" #include #include #include #include namespace WebCore { static const CFStringRef LegacyWebArchiveMainResourceKey = CFSTR("WebMainResource"); static const CFStringRef LegacyWebArchiveSubresourcesKey = CFSTR("WebSubresources"); static const CFStringRef LegacyWebArchiveSubframeArchivesKey = CFSTR("WebSubframeArchives"); static const CFStringRef LegacyWebArchiveResourceDataKey = CFSTR("WebResourceData"); static const CFStringRef LegacyWebArchiveResourceFrameNameKey = CFSTR("WebResourceFrameName"); static const CFStringRef LegacyWebArchiveResourceMIMETypeKey = CFSTR("WebResourceMIMEType"); static const CFStringRef LegacyWebArchiveResourceURLKey = CFSTR("WebResourceURL"); static const CFStringRef LegacyWebArchiveResourceTextEncodingNameKey = CFSTR("WebResourceTextEncodingName"); static const CFStringRef LegacyWebArchiveResourceResponseKey = CFSTR("WebResourceResponse"); static const CFStringRef LegacyWebArchiveResourceResponseVersionKey = CFSTR("WebResourceResponseVersion"); RetainPtr LegacyWebArchive::createPropertyListRepresentation(ArchiveResource* resource, MainResourceStatus isMainResource) { if (!resource) { // The property list representation of a null/empty WebResource has the following 3 objects stored as nil. // FIXME: 0 is not serializable. Presumably we need to use kCFNull here instead for compatibility. // FIXME: But why do we need to support a resource of 0? Who relies on that? RetainPtr propertyList(AdoptCF, CFDictionaryCreateMutable(0, 3, 0, 0)); CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceDataKey, 0); CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceURLKey, 0); CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceMIMETypeKey, 0); return propertyList; } RetainPtr propertyList(AdoptCF, CFDictionaryCreateMutable(0, 6, 0, &kCFTypeDictionaryValueCallBacks)); // Resource data can be empty, but must be represented by an empty CFDataRef SharedBuffer* data = resource->data(); RetainPtr cfData; if (data) cfData.adoptCF(data->createCFData()); else cfData.adoptCF(CFDataCreate(0, 0, 0)); CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceDataKey, cfData.get()); // Resource URL cannot be null RetainPtr cfURL(AdoptCF, resource->url().string().createCFString()); if (cfURL) CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceURLKey, cfURL.get()); else { LOG(Archives, "LegacyWebArchive - NULL resource URL is invalid - returning null property list"); return 0; } // FrameName should be left out if empty for subresources, but always included for main resources const String& frameName(resource->frameName()); if (!frameName.isEmpty() || isMainResource) { RetainPtr cfFrameName(AdoptCF, frameName.createCFString()); CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceFrameNameKey, cfFrameName.get()); } // Set MIMEType, TextEncodingName, and ResourceResponse only if they actually exist const String& mimeType(resource->mimeType()); if (!mimeType.isEmpty()) { RetainPtr cfMIMEType(AdoptCF, mimeType.createCFString()); CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceMIMETypeKey, cfMIMEType.get()); } const String& textEncoding(resource->textEncoding()); if (!textEncoding.isEmpty()) { RetainPtr cfTextEncoding(AdoptCF, textEncoding.createCFString()); CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceTextEncodingNameKey, cfTextEncoding.get()); } // Don't include the resource response for the main resource if (!isMainResource) { RetainPtr resourceResponseData = createPropertyListRepresentation(resource->response()); if (resourceResponseData) CFDictionarySetValue(propertyList.get(), LegacyWebArchiveResourceResponseKey, resourceResponseData.get()); } return propertyList; } RetainPtr LegacyWebArchive::createPropertyListRepresentation(Archive* archive) { RetainPtr propertyList(AdoptCF, CFDictionaryCreateMutable(0, 3, 0, &kCFTypeDictionaryValueCallBacks)); RetainPtr mainResourceDict = createPropertyListRepresentation(archive->mainResource(), MainResource); ASSERT(mainResourceDict); if (!mainResourceDict) return 0; CFDictionarySetValue(propertyList.get(), LegacyWebArchiveMainResourceKey, mainResourceDict.get()); RetainPtr subresourcesArray(AdoptCF, CFArrayCreateMutable(0, archive->subresources().size(), &kCFTypeArrayCallBacks)); const Vector >& subresources(archive->subresources()); for (unsigned i = 0; i < subresources.size(); ++i) { RetainPtr subresource = createPropertyListRepresentation(subresources[i].get(), Subresource); if (subresource) CFArrayAppendValue(subresourcesArray.get(), subresource.get()); else LOG(Archives, "LegacyWebArchive - Failed to create property list for subresource"); } if (CFArrayGetCount(subresourcesArray.get())) CFDictionarySetValue(propertyList.get(), LegacyWebArchiveSubresourcesKey, subresourcesArray.get()); RetainPtr subframesArray(AdoptCF, CFArrayCreateMutable(0, archive->subframeArchives().size(), &kCFTypeArrayCallBacks)); const Vector >& subframeArchives(archive->subframeArchives()); for (unsigned i = 0; i < subframeArchives.size(); ++i) { RetainPtr subframeArchive = createPropertyListRepresentation(subframeArchives[i].get()); if (subframeArchive) CFArrayAppendValue(subframesArray.get(), subframeArchive.get()); else LOG(Archives, "LegacyWebArchive - Failed to create property list for subframe archive"); } if (CFArrayGetCount(subframesArray.get())) CFDictionarySetValue(propertyList.get(), LegacyWebArchiveSubframeArchivesKey, subframesArray.get()); return propertyList; } ResourceResponse LegacyWebArchive::createResourceResponseFromPropertyListData(CFDataRef data, CFStringRef responseDataType) { ASSERT(data); if (!data) return ResourceResponse(); // If the ResourceResponseVersion (passed in as responseDataType) exists at all, this is a "new" web archive that we // can parse well in a cross platform manner If it doesn't exist, we will assume this is an "old" web archive with, // NSURLResponse objects in it and parse the ResourceResponse as such. if (!responseDataType) return createResourceResponseFromMacArchivedData(data); // FIXME: Parse the "new" format that the above comment references here. This format doesn't exist yet. return ResourceResponse(); } PassRefPtr LegacyWebArchive::createResource(CFDictionaryRef dictionary) { ASSERT(dictionary); if (!dictionary) return 0; CFDataRef resourceData = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveResourceDataKey)); if (resourceData && CFGetTypeID(resourceData) != CFDataGetTypeID()) { LOG(Archives, "LegacyWebArchive - Resource data is not of type CFData, cannot create invalid resource"); return 0; } CFStringRef frameName = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveResourceFrameNameKey)); if (frameName && CFGetTypeID(frameName) != CFStringGetTypeID()) { LOG(Archives, "LegacyWebArchive - Frame name is not of type CFString, cannot create invalid resource"); return 0; } CFStringRef mimeType = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveResourceMIMETypeKey)); if (!mimeType || CFGetTypeID(mimeType) != CFStringGetTypeID()) { LOG(Archives, "LegacyWebArchive - MIME type is not of type CFString, cannot create invalid resource"); return 0; } CFStringRef url = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveResourceURLKey)); if (url && CFGetTypeID(url) != CFStringGetTypeID()) { LOG(Archives, "LegacyWebArchive - URL is not of type CFString, cannot create invalid resource"); return 0; } CFStringRef textEncoding = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveResourceTextEncodingNameKey)); if (textEncoding && CFGetTypeID(textEncoding) != CFStringGetTypeID()) { LOG(Archives, "LegacyWebArchive - Text encoding is not of type CFString, cannot create invalid resource"); return 0; } ResourceResponse response; CFDataRef resourceResponseData = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveResourceResponseKey)); if (resourceResponseData) { if (CFGetTypeID(resourceResponseData) != CFDataGetTypeID()) { LOG(Archives, "LegacyWebArchive - Resource response data is not of type CFData, cannot create invalid resource"); return 0; } CFStringRef resourceResponseVersion = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveResourceResponseVersionKey)); if (resourceResponseVersion && CFGetTypeID(resourceResponseVersion) != CFStringGetTypeID()) { LOG(Archives, "LegacyWebArchive - Resource response version is not of type CFString, cannot create invalid resource"); return 0; } response = createResourceResponseFromPropertyListData(resourceResponseData, resourceResponseVersion); } return ArchiveResource::create(SharedBuffer::wrapCFData(resourceData), KURL(KURL(), url), mimeType, textEncoding, frameName, response); } PassRefPtr LegacyWebArchive::create() { return adoptRef(new LegacyWebArchive); } PassRefPtr LegacyWebArchive::create(PassRefPtr mainResource, Vector >& subresources, Vector >& subframeArchives) { ASSERT(mainResource); if (!mainResource) return 0; RefPtr archive = create(); archive->setMainResource(mainResource); for (unsigned i = 0; i < subresources.size(); ++i) archive->addSubresource(subresources[i]); for (unsigned i = 0; i < subframeArchives.size(); ++i) archive->addSubframeArchive(subframeArchives[i]); return archive.release(); } PassRefPtr LegacyWebArchive::create(SharedBuffer* data) { LOG(Archives, "LegacyWebArchive - Creating from raw data"); RefPtr archive = create(); ASSERT(data); if (!data) return 0; RetainPtr cfData(AdoptCF, data->createCFData()); if (!cfData) return 0; CFStringRef errorString = 0; RetainPtr plist(AdoptCF, static_cast(CFPropertyListCreateFromXMLData(0, cfData.get(), kCFPropertyListImmutable, &errorString))); if (!plist) { #ifndef NDEBUG const char* cError = errorString ? CFStringGetCStringPtr(errorString, kCFStringEncodingUTF8) : "unknown error"; LOG(Archives, "LegacyWebArchive - Error parsing PropertyList from archive data - %s", cError); #endif if (errorString) CFRelease(errorString); return 0; } if (CFGetTypeID(plist.get()) != CFDictionaryGetTypeID()) { LOG(Archives, "LegacyWebArchive - Archive property list is not the expected CFDictionary, aborting invalid WebArchive"); return 0; } if (!archive->extract(plist.get())) return 0; return archive.release(); } bool LegacyWebArchive::extract(CFDictionaryRef dictionary) { ASSERT(dictionary); if (!dictionary) { LOG(Archives, "LegacyWebArchive - Null root CFDictionary, aborting invalid WebArchive"); return false; } CFDictionaryRef mainResourceDict = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveMainResourceKey)); if (!mainResourceDict) { LOG(Archives, "LegacyWebArchive - No main resource in archive, aborting invalid WebArchive"); return false; } if (CFGetTypeID(mainResourceDict) != CFDictionaryGetTypeID()) { LOG(Archives, "LegacyWebArchive - Main resource is not the expected CFDictionary, aborting invalid WebArchive"); return false; } setMainResource(createResource(mainResourceDict)); if (!mainResource()) { LOG(Archives, "LegacyWebArchive - Failed to parse main resource from CFDictionary or main resource does not exist, aborting invalid WebArchive"); return false; } CFArrayRef subresourceArray = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveSubresourcesKey)); if (subresourceArray && CFGetTypeID(subresourceArray) != CFArrayGetTypeID()) { LOG(Archives, "LegacyWebArchive - Subresources is not the expected Array, aborting invalid WebArchive"); return false; } if (subresourceArray) { CFIndex count = CFArrayGetCount(subresourceArray); for (CFIndex i = 0; i < count; ++i) { CFDictionaryRef subresourceDict = static_cast(CFArrayGetValueAtIndex(subresourceArray, i)); if (CFGetTypeID(subresourceDict) != CFDictionaryGetTypeID()) { LOG(Archives, "LegacyWebArchive - Subresource is not expected CFDictionary, aborting invalid WebArchive"); return false; } addSubresource(createResource(subresourceDict)); } } CFArrayRef subframeArray = static_cast(CFDictionaryGetValue(dictionary, LegacyWebArchiveSubframeArchivesKey)); if (subframeArray && CFGetTypeID(subframeArray) != CFArrayGetTypeID()) { LOG(Archives, "LegacyWebArchive - Subframe archives is not the expected Array, aborting invalid WebArchive"); return false; } if (subframeArray) { CFIndex count = CFArrayGetCount(subframeArray); for (CFIndex i = 0; i < count; ++i) { CFDictionaryRef subframeDict = static_cast(CFArrayGetValueAtIndex(subframeArray, i)); if (CFGetTypeID(subframeDict) != CFDictionaryGetTypeID()) { LOG(Archives, "LegacyWebArchive - Subframe array is not expected CFDictionary, aborting invalid WebArchive"); return false; } RefPtr subframeArchive = create(); if (subframeArchive->extract(subframeDict)) addSubframeArchive(subframeArchive.release()); else LOG(Archives, "LegacyWebArchive - Invalid subframe archive skipped"); } } return true; } RetainPtr LegacyWebArchive::rawDataRepresentation() { RetainPtr propertyList = createPropertyListRepresentation(this); ASSERT(propertyList); if (!propertyList) { LOG(Archives, "LegacyWebArchive - Failed to create property list for archive, returning no data"); return 0; } RetainPtr stream(AdoptCF, CFWriteStreamCreateWithAllocatedBuffers(0, 0)); CFWriteStreamOpen(stream.get()); CFPropertyListWriteToStream(propertyList.get(), stream.get(), kCFPropertyListBinaryFormat_v1_0, 0); RetainPtr plistData(AdoptCF, static_cast(CFWriteStreamCopyProperty(stream.get(), kCFStreamPropertyDataWritten))); ASSERT(plistData); CFWriteStreamClose(stream.get()); if (!plistData) { LOG(Archives, "LegacyWebArchive - Failed to convert property list into raw data, returning no data"); return 0; } return plistData; } #if !PLATFORM(MAC) ResourceResponse LegacyWebArchive::createResourceResponseFromMacArchivedData(CFDataRef responseData) { // FIXME: If is is possible to parse in a serialized NSURLResponse manually, without using // NSKeyedUnarchiver, manipulating plists directly, then we want to do that here. // Until then, this can be done on Mac only. return ResourceResponse(); } RetainPtr LegacyWebArchive::createPropertyListRepresentation(const ResourceResponse& response) { // FIXME: Write out the "new" format described in createResourceResponseFromPropertyListData once we invent it. return 0; } #endif PassRefPtr LegacyWebArchive::create(Node* node) { ASSERT(node); if (!node) return create(); Document* document = node->document(); Frame* frame = document ? document->frame() : 0; if (!frame) return create(); Vector nodeList; String markupString = createMarkup(node, IncludeNode, &nodeList); Node::NodeType nodeType = node->nodeType(); if (nodeType != Node::DOCUMENT_NODE && nodeType != Node::DOCUMENT_TYPE_NODE) markupString = frame->documentTypeString() + markupString; return create(markupString, frame, nodeList); } PassRefPtr LegacyWebArchive::create(Frame* frame) { ASSERT(frame); DocumentLoader* documentLoader = frame->loader()->documentLoader(); if (!documentLoader) return 0; Vector > subframeArchives; unsigned children = frame->tree()->childCount(); for (unsigned i = 0; i < children; ++i) { RefPtr childFrameArchive = create(frame->tree()->child(i)); if (childFrameArchive) subframeArchives.append(childFrameArchive.release()); } Vector > subresources; documentLoader->getSubresources(subresources); return create(documentLoader->mainResource(), subresources, subframeArchives); } PassRefPtr LegacyWebArchive::create(Range* range) { if (!range) return 0; Node* startContainer = range->startContainer(); if (!startContainer) return 0; Document* document = startContainer->document(); if (!document) return 0; Frame* frame = document->frame(); if (!frame) return 0; Vector nodeList; // FIXME: This is always "for interchange". Is that right? See the previous method. String markupString = frame->documentTypeString() + createMarkup(range, &nodeList, AnnotateForInterchange); return create(markupString, frame, nodeList); } PassRefPtr LegacyWebArchive::create(const String& markupString, Frame* frame, const Vector& nodes) { ASSERT(frame); const ResourceResponse& response = frame->loader()->documentLoader()->response(); KURL responseURL = response.url(); // it's possible to have a response without a URL here // if (responseURL.isNull()) responseURL = KURL(ParsedURLString, ""); PassRefPtr mainResource = ArchiveResource::create(utf8Buffer(markupString), responseURL, response.mimeType(), "UTF-8", frame->tree()->name()); Vector > subframeArchives; Vector > subresources; HashSet uniqueSubresources; size_t nodesSize = nodes.size(); for (size_t i = 0; i < nodesSize; ++i) { Node* node = nodes[i]; Frame* childFrame; if ((node->hasTagName(HTMLNames::frameTag) || node->hasTagName(HTMLNames::iframeTag) || node->hasTagName(HTMLNames::objectTag)) && (childFrame = static_cast(node)->contentFrame())) { RefPtr subframeArchive = create(childFrame->document()); if (subframeArchive) subframeArchives.append(subframeArchive); else LOG_ERROR("Unabled to archive subframe %s", childFrame->tree()->name().string().utf8().data()); } else { ListHashSet subresourceURLs; node->getSubresourceURLs(subresourceURLs); DocumentLoader* documentLoader = frame->loader()->documentLoader(); ListHashSet::iterator iterEnd = subresourceURLs.end(); for (ListHashSet::iterator iter = subresourceURLs.begin(); iter != iterEnd; ++iter) { const KURL& subresourceURL = *iter; if (uniqueSubresources.contains(subresourceURL)) continue; uniqueSubresources.add(subresourceURL); RefPtr resource = documentLoader->subresource(subresourceURL); if (resource) { subresources.append(resource.release()); continue; } CachedResource *cachedResource = cache()->resourceForURL(subresourceURL); if (cachedResource) { resource = ArchiveResource::create(cachedResource->data(), subresourceURL, cachedResource->response()); if (resource) { subresources.append(resource.release()); continue; } } // FIXME: should do something better than spew to console here LOG_ERROR("Failed to archive subresource for %s", subresourceURL.string().utf8().data()); } } } // Add favicon if one exists for this page, if we are archiving the entire page. if (nodesSize && nodes[0]->isDocumentNode() && iconDatabase() && iconDatabase()->isEnabled()) { const String& iconURL = iconDatabase()->iconURLForPageURL(responseURL); if (!iconURL.isEmpty() && iconDatabase()->iconDataKnownForIconURL(iconURL)) { if (Image* iconImage = iconDatabase()->iconForPageURL(responseURL, IntSize(16, 16))) { if (RefPtr resource = ArchiveResource::create(iconImage->data(), KURL(ParsedURLString, iconURL), "image/x-icon", "", "")) subresources.append(resource.release()); } } } return create(mainResource, subresources, subframeArchives); } PassRefPtr LegacyWebArchive::createFromSelection(Frame* frame) { if (!frame) return 0; RefPtr selectionRange = frame->selection()->toNormalizedRange(); Vector nodeList; String markupString = frame->documentTypeString() + createMarkup(selectionRange.get(), &nodeList, AnnotateForInterchange); RefPtr archive = create(markupString, frame, nodeList); if (!frame->document() || !frame->document()->isFrameSet()) return archive.release(); // Wrap the frameset document in an iframe so it can be pasted into // another document (which will have a body or frameset of its own). String iframeMarkup = makeString(""); RefPtr iframeResource = ArchiveResource::create(utf8Buffer(iframeMarkup), blankURL(), "text/html", "UTF-8", String()); Vector > subresources; Vector > subframeArchives; subframeArchives.append(archive); archive = create(iframeResource.release(), subresources, subframeArchives); return archive.release(); } }