diff options
author | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 19:29:09 -0800 |
---|---|---|
committer | The Android Open Source Project <initial-contribution@android.com> | 2009-03-03 19:29:09 -0800 |
commit | 55a2c71f27d3e0b8344597c7f281e687cb7aeb1b (patch) | |
tree | ecd18b995aea8eeeb8b3823266280d41245bf0f7 /scripts/app_engine_server/memcache_zipserve.py | |
parent | 82ea7a177797b844b252effea5c7c7c5d63ea4ac (diff) | |
download | sdk-55a2c71f27d3e0b8344597c7f281e687cb7aeb1b.zip sdk-55a2c71f27d3e0b8344597c7f281e687cb7aeb1b.tar.gz sdk-55a2c71f27d3e0b8344597c7f281e687cb7aeb1b.tar.bz2 |
auto import from //depot/cupcake/@135843
Diffstat (limited to 'scripts/app_engine_server/memcache_zipserve.py')
-rw-r--r-- | scripts/app_engine_server/memcache_zipserve.py | 412 |
1 files changed, 412 insertions, 0 deletions
#!/usr/bin/env python
#
# Copyright 2009 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A class to serve pages from zip files and use memcache for performance.

This contains a class and a function to create an anonymous instance of the
class to serve HTTP GET requests. Memcache is used to increase response speed
and lower processing cycles used in serving. Credit to Guido van Rossum and
his implementation of zipserve which served as a reference as I wrote this.

  MemcachedZipHandler: Class that serves request
  create_handler: method to create instance of MemcachedZipHandler
"""

__author__ = 'jmatt@google.com (Justin Mattson)'

import email.utils
import logging
import mimetypes
import time
import zipfile

from google.appengine.api import memcache
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util


def create_handler(zip_files, max_age=None, public=None):
  """Factory method to create a MemcachedZipHandler instance.

  Args:
    zip_files: A list of file names, or a list of lists of file name, first
      member of file mappings. See MemcachedZipHandler documentation for
      more information about using the list of lists format
    max_age: The maximum client-side cache lifetime
    public: Whether this should be declared public in the client-side cache

  Returns:
    A MemcachedZipHandler wrapped in a pretty, anonymous bow for use with App
    Engine

  Raises:
    ValueError: if the zip_files argument is not a non-empty list
  """
  # Verify argument integrity. If the argument is passed in list format,
  # normalize it to the list-of-lists format the handler expects.
  if not (zip_files and isinstance(zip_files, list)):
    raise ValueError('File name arguments must be a list')
  zip_files = [entry if isinstance(entry, list) else [entry]
               for entry in zip_files]

  class HandlerWrapper(MemcachedZipHandler):
    """Simple wrapper for an instance of MemcachedZipHandler.

    Binds the zip_files closure variable to each request so the anonymous
    handler class can be registered with webapp without constructor args.
    """

    def get(self, name):
      self.zipfilenames = zip_files
      self.TrueGet(name)

  # Apply cache-policy overrides as *class* attributes. (The original code
  # assigned to local names MAX_AGE/PUBLIC inside get(), which silently had
  # no effect on the handler's caching behavior.)
  if max_age is not None:
    HandlerWrapper.MAX_AGE = max_age
  if public is not None:
    HandlerWrapper.PUBLIC = public

  return HandlerWrapper


class MemcachedZipHandler(webapp.RequestHandler):
  """Handles get requests for a given URL.

  Serves a GET request from a series of zip files. As files are served they are
  put into memcache, which is much faster than retrieving them from the zip
  source file again. It also uses considerably fewer CPU cycles.
  """
  zipfile_cache = {}                # class-level cache of open source zip files
  MAX_AGE = 600                     # max client-side cache lifetime (seconds)
  PUBLIC = True                     # public cache setting
  CACHE_PREFIX = 'cache://'         # memcache key prefix for actual URLs
  NEG_CACHE_PREFIX = 'noncache://'  # memcache key prefix for non-existent URLs

  def TrueGet(self, name):
    """The top-level entry point to serving requests.

    Called 'True' get because it does the work when called from the wrapper
    class' get method

    Args:
      name: URL requested

    Returns:
      None
    """
    name = self.PreprocessUrl(name)

    # see if we have the page in the memcache
    resp_data = self.GetFromCache(name)
    if resp_data is None:
      logging.info('Cache miss for %s', name)
      resp_data = self.GetFromNegativeCache(name)
      if resp_data is None:
        resp_data = self.GetFromStore(name)

        # IF we have the file, put it in the memcache
        # ELSE put it in the negative cache
        if resp_data is not None:
          self.StoreOrUpdateInCache(name, resp_data)
        else:
          logging.info('Adding %s to negative cache, serving 404', name)
          self.StoreInNegativeCache(name)
          self.Write404Error()
          return
      else:
        # negative-cache hit: we already know this URL does not exist
        self.Write404Error()
        return

    content_type, encoding = mimetypes.guess_type(name)
    if content_type:
      self.response.headers['Content-Type'] = content_type
    self.SetCachingHeaders()
    self.response.out.write(resp_data)

  def PreprocessUrl(self, name):
    """Any preprocessing work on the URL when it comes in.

    Put any work related to interpreting the incoming URL here. For example,
    this is used to redirect requests for a directory to the index.html file
    in that directory. Subclasses should override this method to do different
    preprocessing.

    Args:
      name: The incoming URL

    Returns:
      The processed URL
    """
    # handle special case of requesting the domain itself
    if not name:
      name = 'index.html'

    # determine if this is a request for a directory (final path segment
    # contains no dot, so it cannot be a file name)
    final_path_segment = name
    final_slash_offset = name.rfind('/')
    if final_slash_offset != len(name) - 1:
      final_path_segment = name[final_slash_offset + 1:]
      if final_path_segment.find('.') == -1:
        name = ''.join([name, '/'])

    # if this is a directory, redirect to index.html
    if name[len(name) - 1:] == '/':
      return '%s%s' % (name, 'index.html')
    else:
      return name

  def GetFromStore(self, file_path):
    """Retrieve file from zip files.

    Get the file from the source, it must not have been in the memcache. If
    possible, we'll use the zip file index to quickly locate where the file
    should be found. (See MapToFileArchive documentation for assumptions about
    file ordering.) If we don't have an index or don't find the file where the
    index says we should, look through all the zip files to find it.

    Args:
      file_path: the file that we're looking for

    Returns:
      The contents of the requested file, or None if it was not found
    """
    resp_data = None
    file_itr = iter(self.zipfilenames)

    # check the index, if we have one, to see what archive the file is in
    archive_name = self.MapFileToArchive(file_path)
    if not archive_name:
      archive_name = next(file_itr)[0]

    while resp_data is None and archive_name:
      zip_archive = self.LoadZipFile(archive_name)
      if zip_archive:

        # we expect some lookups will fail, and that's okay, 404s will deal
        # with that
        try:
          resp_data = zip_archive.read(file_path)
        except (KeyError, RuntimeError):
          # lookup miss in this archive; fall through and try the next one
          pass
        if resp_data is not None:
          logging.info('%s read from %s', file_path, archive_name)

      try:
        archive_name = next(file_itr)[0]
      except StopIteration:
        archive_name = False

    return resp_data

  def LoadZipFile(self, zipfilename):
    """Convenience method to load zip file.

    Just a convenience method to load the zip file from the data store. This is
    useful if we ever want to change data stores and also as a means of
    dependency injection for testing. This method will look at our file cache
    first, and then load and cache the file if there's a cache miss

    Args:
      zipfilename: the name of the zip file to load

    Returns:
      The zip file requested, or None if there is an I/O error
    """
    zip_archive = self.zipfile_cache.get(zipfilename)
    if zip_archive is None:
      try:
        zip_archive = zipfile.ZipFile(zipfilename)
        self.zipfile_cache[zipfilename] = zip_archive
      except (IOError, RuntimeError) as err:
        logging.error('Can\'t open zipfile %s, cause: %s', zipfilename, err)
    return zip_archive

  def MapFileToArchive(self, file_path):
    """Given a file name, determine what archive it should be in.

    This method makes two critical assumptions.
    (1) The zip files passed as an argument to the handler, if concatenated
        in that same order, would result in a total ordering
        of all the files. See (2) for ordering type.
    (2) Upper case letters before lower case letters. The traversal of a
        directory tree is depth first. A parent directory's files are added
        before the files of any child directories

    Args:
      file_path: the file to be mapped to an archive

    Returns:
      The name of the archive where we expect the file to be, or None if no
      archive with an index entry covers it
    """
    # walk the archives from last to first; the first archive whose
    # first-file marker sorts at-or-before file_path is the candidate
    for target in reversed(self.zipfilenames):
      if len(target) > 1 and self.CompareFilenames(target[1], file_path) >= 0:
        return target[0]
    return None

  def CompareFilenames(self, file1, file2):
    """Determines whether file1 is lexicographically 'before' file2.

    WARNING: This method assumes that paths are output in a depth-first,
    with parent directories' files stored before childs'

    We say that file1 is lexicographically before file2 if the last
    non-matching path segment of file1 is alphabetically before file2.

    Args:
      file1: the first file path
      file2: the second file path

    Returns:
      A positive number if file1 is before file2
      A negative number if file2 is before file1
      0 if filenames are the same
    """
    f1_segments = file1.split('/')
    f2_segments = file2.split('/')

    # advance past the common prefix of path segments
    segment_ptr = 0
    while (segment_ptr < len(f1_segments) and
           segment_ptr < len(f2_segments) and
           f1_segments[segment_ptr] == f2_segments[segment_ptr]):
      segment_ptr += 1

    if len(f1_segments) == len(f2_segments):

      # we fell off the end, the paths must be the same
      if segment_ptr == len(f1_segments):
        return 0

      # we didn't fall off the end, compare the segments where they differ
      if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
        return 1
      elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
        return -1
      else:
        return 0

    # the number of segments differs, we either mismatched comparing
    # directories, or comparing a file to a directory
    else:

      # IF we were looking at the last segment of one of the paths,
      # the one with fewer segments is first because files come before
      # directories
      # ELSE we just need to compare directory names
      if (segment_ptr + 1 == len(f1_segments) or
          segment_ptr + 1 == len(f2_segments)):
        return len(f2_segments) - len(f1_segments)
      else:
        if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
          return 1
        elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
          return -1
        else:
          return 0

  def SetCachingHeaders(self):
    """Set caching headers for the request."""
    max_age = self.MAX_AGE
    self.response.headers['Expires'] = email.utils.formatdate(
        time.time() + max_age, usegmt=True)
    cache_control = []
    if self.PUBLIC:
      cache_control.append('public')
    cache_control.append('max-age=%d' % max_age)
    self.response.headers['Cache-Control'] = ', '.join(cache_control)

  def GetFromCache(self, filename):
    """Get file from memcache, if available.

    Args:
      filename: The URL of the file to return

    Returns:
      The content of the file, or None on a cache miss
    """
    return memcache.get('%s%s' % (self.CACHE_PREFIX, filename))

  def StoreOrUpdateInCache(self, filename, data):
    """Store data in the cache.

    Store a piece of data in the memcache. Memcache has a maximum item size of
    1*10^6 bytes. If the data is too large, fail, but log the failure. Future
    work will consider compressing the data before storing or chunking it

    Args:
      filename: the name of the file to store
      data: the data of the file

    Returns:
      None
    """
    try:
      if not memcache.add('%s%s' % (self.CACHE_PREFIX, filename), data):
        memcache.replace('%s%s' % (self.CACHE_PREFIX, filename), data)
    except ValueError as err:
      logging.warning('Data size too large to cache\n%s', err)

  def Write404Error(self):
    """Output a simple 404 response."""
    self.error(404)
    self.response.out.write(
        ''.join(['<html><head><title>404: Not Found</title></head>',
                 '<body><b><h2>Error 404</h2><br/>',
                 'File not found</b></body></html>']))

  def StoreInNegativeCache(self, filename):
    """If a non-existent URL is accessed, cache this result as well.

    Future work should consider setting a maximum negative cache size to
    prevent it from negatively impacting the real cache.

    Args:
      filename: URL to add to negative cache

    Returns:
      None
    """
    memcache.add('%s%s' % (self.NEG_CACHE_PREFIX, filename), -1)

  def GetFromNegativeCache(self, filename):
    """Retrieve from negative cache.

    Args:
      filename: URL to retrieve

    Returns:
      The sentinel value (-1) if present in the negative cache, else None.
    """
    return memcache.get('%s%s' % (self.NEG_CACHE_PREFIX, filename))


def main():
  application = webapp.WSGIApplication([('/([^/]+)/(.*)',
                                         MemcachedZipHandler)])
  util.run_wsgi_app(application)


if __name__ == '__main__':
  main()