diff options
Diffstat (limited to 'Source/WebKit/chromium/src/WebPageSerializer.cpp')
-rw-r--r-- | Source/WebKit/chromium/src/WebPageSerializer.cpp | 177 |
1 files changed, 176 insertions, 1 deletions
diff --git a/Source/WebKit/chromium/src/WebPageSerializer.cpp b/Source/WebKit/chromium/src/WebPageSerializer.cpp index 1fda484..c8c7529 100644 --- a/Source/WebKit/chromium/src/WebPageSerializer.cpp +++ b/Source/WebKit/chromium/src/WebPageSerializer.cpp @@ -31,19 +31,152 @@ #include "config.h" #include "WebPageSerializer.h" +#include "DocumentLoader.h" +#include "Element.h" +#include "Frame.h" +#include "HTMLAllCollection.h" +#include "HTMLFrameOwnerElement.h" +#include "HTMLInputElement.h" +#include "HTMLNames.h" #include "KURL.h" +#include "Vector.h" +#include "WebCString.h" #include "WebFrame.h" +#include "WebFrameImpl.h" #include "WebPageSerializerClient.h" #include "WebPageSerializerImpl.h" #include "WebString.h" #include "WebURL.h" #include "WebVector.h" +#include "WebView.h" #include <wtf/text/StringConcatenate.h> using namespace WebCore; +namespace { + +KURL getSubResourceURLFromElement(Element* element) +{ + ASSERT(element); + const QualifiedName* attributeName = 0; + if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag)) + attributeName = &HTMLNames::srcAttr; + else if (element->hasTagName(HTMLNames::inputTag)) { + HTMLInputElement* input = static_cast<HTMLInputElement*>(element); + if (input->isImageButton()) + attributeName = &HTMLNames::srcAttr; + } else if (element->hasTagName(HTMLNames::bodyTag) + || element->hasTagName(HTMLNames::tableTag) + || element->hasTagName(HTMLNames::trTag) + || element->hasTagName(HTMLNames::tdTag)) + attributeName = &HTMLNames::backgroundAttr; + else if (element->hasTagName(HTMLNames::blockquoteTag) + || element->hasTagName(HTMLNames::qTag) + || element->hasTagName(HTMLNames::delTag) + || element->hasTagName(HTMLNames::insTag)) + attributeName = &HTMLNames::citeAttr; + else if (element->hasTagName(HTMLNames::linkTag)) { + // If the link element is not css, ignore it. + if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) { + // FIXME: Add support for extracting links of sub-resources which + // are inside style-sheet such as @import, @font-face, url(), etc. + attributeName = &HTMLNames::hrefAttr; + } + } else if (element->hasTagName(HTMLNames::objectTag)) + attributeName = &HTMLNames::dataAttr; + else if (element->hasTagName(HTMLNames::embedTag)) + attributeName = &HTMLNames::srcAttr; + + if (!attributeName) + return KURL(); + + String value = element->getAttribute(*attributeName); + // Ignore javascript content. + if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false)) + return KURL(); + + return element->document()->completeURL(value); +} + +void retrieveResourcesForElement(Element* element, + Vector<Frame*>* visitedFrames, + Vector<Frame*>* framesToVisit, + Vector<KURL>* frameURLs, + Vector<KURL>* resourceURLs) +{ + // If the node is a frame, we'll process it later in retrieveResourcesForFrame. + if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag) + || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag)) + && element->isFrameOwnerElement()) { + Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame(); + if (frame) { + if (!visitedFrames->contains(frame)) + framesToVisit->append(frame); + return; + } + } + + KURL url = getSubResourceURLFromElement(element); + if (url.isEmpty() || !url.isValid()) + return; // No subresource for this node. + + // Ignore URLs that have a non-standard protocols. Since the FTP protocol + // does no have a cache mechanism, we skip it as well. + if (!url.protocolInHTTPFamily() && !url.isLocalFile()) + return; + + if (!resourceURLs->contains(url)) + resourceURLs->append(url); +} + +void retrieveResourcesForFrame(Frame* frame, + const WebKit::WebVector<WebKit::WebCString>& supportedSchemes, + Vector<Frame*>* visitedFrames, + Vector<Frame*>* framesToVisit, + Vector<KURL>* frameURLs, + Vector<KURL>* resourceURLs) +{ + KURL frameURL = frame->loader()->documentLoader()->request().url(); + + // If the frame's URL is invalid, ignore it, it is not retrievable. + if (!frameURL.isValid()) + return; + + // Ignore frames from unsupported schemes. + bool isValidScheme = false; + for (size_t i = 0; i < supportedSchemes.size(); ++i) { + if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) { + isValidScheme = true; + break; + } + } + if (!isValidScheme) + return; + + // If we have already seen that frame, ignore it. + if (visitedFrames->contains(frame)) + return; + visitedFrames->append(frame); + if (!frameURLs->contains(frameURL)) + frameURLs->append(frameURL); + + // Now get the resources associated with each node of the document. + RefPtr<HTMLAllCollection> allNodes = frame->document()->all(); + for (unsigned i = 0; i < allNodes->length(); ++i) { + Node* node = allNodes->item(i); + // We are only interested in HTML resources. + if (!node->isElementNode()) + continue; + retrieveResourcesForElement(static_cast<Element*>(node), + visitedFrames, framesToVisit, + frameURLs, resourceURLs); + } +} + +} // namespace + namespace WebKit { bool WebPageSerializer::serialize(WebFrame* frame, @@ -58,6 +191,48 @@ bool WebPageSerializer::serialize(WebFrame* frame, return serializerImpl.serialize(); } +bool WebPageSerializer::retrieveAllResources(WebView* view, + const WebVector<WebCString>& supportedSchemes, + WebVector<WebURL>* resourceURLs, + WebVector<WebURL>* frameURLs) { + WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame()); + if (!mainFrame) + return false; + + Vector<Frame*> framesToVisit; + Vector<Frame*> visitedFrames; + Vector<KURL> frameKURLs; + Vector<KURL> resourceKURLs; + + // Let's retrieve the resources from every frame in this page. + framesToVisit.append(mainFrame->frame()); + while (!framesToVisit.isEmpty()) { + Frame* frame = framesToVisit[0]; + framesToVisit.remove(0); + retrieveResourcesForFrame(frame, supportedSchemes, + &visitedFrames, &framesToVisit, + &frameKURLs, &resourceKURLs); + } + + // Converts the results to WebURLs. + WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); + for (size_t i = 0; i < resourceKURLs.size(); ++i) { + resultResourceURLs[i] = resourceKURLs[i]; + // A frame's src can point to the same URL as another resource, keep the + // resource URL only in such cases. + size_t index = frameKURLs.find(resourceKURLs[i]); + if (index != notFound) + frameKURLs.remove(index); + } + *resourceURLs = resultResourceURLs; + WebVector<WebURL> resultFrameURLs(frameKURLs.size()); + for (size_t i = 0; i < frameKURLs.size(); ++i) + resultFrameURLs[i] = frameKURLs[i]; + *frameURLs = resultFrameURLs; + + return true; +} + WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) { return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=", static_cast<const String&>(charset), "\">"); @@ -77,4 +252,4 @@ WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTar return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">"); } -} // namespace WebKit +} // namespace WebKit |