diff options
Diffstat (limited to 'Tools/Scripts/webkitpy/layout_tests/deduplicate_tests.py')
-rw-r--r-- | Tools/Scripts/webkitpy/layout_tests/deduplicate_tests.py | 231 |
1 files changed, 231 insertions, 0 deletions
#!/usr/bin/env python
# Copyright (C) 2010 Google Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""deduplicate_tests -- lists test results duplicated between platforms.

If platform/mac-leopard is missing an expected test output, we fall back on
platform/mac.  This means it's possible to grow redundant test outputs,
where we have the same expected data in both a platform directory and another
platform it falls back on.
"""

import collections
import errno
import fnmatch
import os
import re
import subprocess
import sys

import webkitpy.common.checkout.scm as scm
import webkitpy.common.system.executive as executive
import webkitpy.common.system.logutils as logutils
import webkitpy.common.system.ospath as ospath
import webkitpy.layout_tests.port.factory as port_factory

_log = logutils.get_logger(__file__)

_BASE_PLATFORM = 'base'

# Splits an optional leading "platform/<name>/" off a LayoutTests-relative
# path; group 2 is the platform-independent test name.
_PLATFORM_PREFIX_RE = re.compile(r'^(platform/.*?/)?(.*)')

# Captures <name> from a path that starts with "platform/<name>/".
_PLATFORM_NAME_RE = re.compile(r'^platform/(.*?)/')


def port_fallbacks():
    """Get the port fallback information.

    Every entry of <checkout root>/LayoutTests/platform is treated as a port
    name and asked for its baseline search path.

    Returns:
        A dictionary mapping platform name to a list of other platforms to
        fall back on.  All platforms fall back on 'base'.
    """
    fallbacks = {_BASE_PLATFORM: []}
    platform_dir = os.path.join(scm.find_checkout_root(), 'LayoutTests',
                                'platform')
    for port_name in os.listdir(platform_dir):
        try:
            platforms = port_factory.get(port_name).baseline_search_path()
        except NotImplementedError:
            _log.error("'%s' lacks baseline_search_path(), please fix.",
                       port_name)
            fallbacks[port_name] = [_BASE_PLATFORM]
            continue
        # The first search-path entry is skipped (presumably the port's own
        # directory -- confirm against baseline_search_path()); the remaining
        # entries are the platforms it falls back on, ending with 'base'.
        fallbacks[port_name] = [os.path.basename(p) for p in platforms][1:]
        fallbacks[port_name].append(_BASE_PLATFORM)
    return fallbacks


def parse_git_output(git_output, glob_pattern):
    """Parses the output of git ls-tree and filters based on glob_pattern.

    Each non-empty line is expected to look like
    "<mode> <type> <hash>\\tLayoutTests/<path>".

    Args:
        git_output: result of git ls-tree -r HEAD LayoutTests.
        glob_pattern: an fnmatch pattern matched against the full path
            (including the leading 'LayoutTests/').
    Returns:
        A dictionary mapping (test name, hash of content) => set of paths,
        where the paths are relative to LayoutTests.
    """
    hashes = collections.defaultdict(set)
    prefix_length = len('LayoutTests/')
    for line in git_output.split('\n'):
        if not line:
            # Skip blank lines (e.g. the trailing newline) instead of
            # stopping, so a stray empty line cannot truncate the results.
            continue
        attrs, path = line.strip().split('\t', 1)
        if not fnmatch.fnmatch(path, glob_pattern):
            continue
        path = path[prefix_length:]
        test = _PLATFORM_PREFIX_RE.match(path).group(2)
        _, _, content_hash = attrs.split(' ')
        hashes[(test, content_hash)].add(path)
    return hashes


def cluster_file_hashes(glob_pattern):
    """Get the hashes of all the test expectations in the tree.

    We cheat and use git's hashes.

    Args:
        glob_pattern: a pattern to filter the files.
    Returns:
        A dictionary mapping (test name, hash of content) => set of paths.
    Exits:
        With status 1 if the git executable cannot be found.
    """
    cmd = ('git', 'ls-tree', '-r', 'HEAD', 'LayoutTests')
    try:
        git_output = executive.Executive().run_command(
            cmd, cwd=scm.find_checkout_root())
    except OSError:
        # sys.exc_info() rather than "except OSError, e" / "as e" keeps this
        # handler valid syntax on every Python version.
        if sys.exc_info()[1].errno == errno.ENOENT:
            _log.error("Error: 'No such file' when running git.")
            _log.error("This script requires git.")
            sys.exit(1)
        raise  # Bare raise preserves the original traceback.
    return parse_git_output(git_output, glob_pattern)


def extract_platforms(paths):
    """Extracts the platforms from a list of paths matching ^platform/(.*?)/.

    Paths that do not live under platform/<name>/ are attributed to 'base'.

    Args:
        paths: an iterable of paths relative to LayoutTests.
    Returns:
        A dictionary mapping each platform found to one of its paths (the
        last one seen when several paths share a platform).
    """
    platforms = {}
    for path in paths:
        match = _PLATFORM_NAME_RE.match(path)
        if match:
            platform = match.group(1)
        else:
            platform = _BASE_PLATFORM
        platforms[platform] = path
    return platforms


def has_intermediate_results(test, fallbacks, matching_platform,
                             path_exists=os.path.exists):
    """Returns True if there is a test result that causes us to not delete
    this duplicate.

    For example, chromium-linux may be a duplicate of the checked in result,
    but chromium-win may have a different result checked in.  In this case,
    we need to keep the duplicate results.

    Args:
        test: The test name.
        fallbacks: A list of platforms we fall back on, in search order.
        matching_platform: The platform that we found the duplicate test
            result.  We can stop checking here.
        path_exists: Optional parameter that allows us to stub out
            os.path.exists for testing.
    Returns:
        True if a platform that precedes matching_platform in fallbacks has
        its own result for the test, False otherwise.
    """
    for platform in fallbacks:
        if platform == matching_platform:
            return False
        # NOTE: this path is relative, so with the default os.path.exists the
        # check is resolved against the current working directory.
        test_path = os.path.join('LayoutTests', 'platform', platform, test)
        if path_exists(test_path):
            return True
    return False


def get_relative_test_path(filename, relative_to, checkout_root=None):
    """Constructs a relative path to |filename| from |relative_to|.

    Args:
        filename: The test file we're trying to get a relative path to,
            relative to LayoutTests.
        relative_to: The absolute path we're relative to.
        checkout_root: Optional checkout root directory.  When omitted it is
            looked up via scm.find_checkout_root() at call time, so that
            importing this module does not touch the checkout.
    Returns:
        A relative path to filename or None if |filename| is not below
        |relative_to|.
    """
    if checkout_root is None:
        checkout_root = scm.find_checkout_root()
    layout_test_dir = os.path.join(checkout_root, 'LayoutTests')
    abs_path = os.path.join(layout_test_dir, filename)
    return ospath.relpath(abs_path, relative_to)


def find_dups(hashes, port_fallbacks, relative_to):
    """Yields info about redundant test expectations.

    Args:
        hashes: a dictionary of hashes as returned by cluster_file_hashes.
        port_fallbacks: a dictionary of fallback information as returned by
            port_fallbacks().
        relative_to: the directory that we want the results relative to.
    Yields:
        A dictionary with the keys 'test', 'platform', 'fallback' and 'path'
        for every result that duplicates the result of a platform it falls
        back on.
    """
    for (test, _), cluster in hashes.items():
        if len(cluster) < 2:
            continue  # Common case: only one file with that hash.

        # Compute the list of platforms we have this particular hash for.
        platforms = extract_platforms(cluster)
        if len(platforms) == 1:
            continue

        # See if any of the platforms are redundant with each other.
        for platform in platforms:
            fallback_chain = port_fallbacks[platform]
            for fallback in fallback_chain:
                if fallback not in platforms:
                    continue
                # We have to verify that there isn't an intermediate result
                # that causes this duplicate hash to exist.
                if has_intermediate_results(test, fallback_chain, fallback):
                    continue
                # We print the relative path so it's easy to pipe the results
                # to xargs rm.
                path = get_relative_test_path(platforms[platform],
                                              relative_to)
                if not path:
                    continue
                yield {
                    'test': test,
                    'platform': platform,
                    'fallback': fallback,
                    'path': path,
                }


def deduplicate(glob_pattern):
    """Traverses LayoutTests and returns information about duplicated files.

    Args:
        glob_pattern: glob pattern to filter the files in LayoutTests.
    Returns:
        a list of dictionaries, each containing test, path, platform and
        fallback, with paths relative to the current working directory.
    """
    fallbacks = port_fallbacks()
    hashes = cluster_file_hashes(glob_pattern)
    return list(find_dups(hashes, fallbacks, os.getcwd()))