lib/rubocop/result_cache.rb
# frozen_string_literal: true
require 'digest/sha1'
require 'find'
require 'zlib'
require_relative 'cache_config'
module RuboCop
# Provides functionality for caching RuboCop runs.
# @api private
class ResultCache
NON_CHANGING = %i[color format formatters out debug fail_level
fix_layout autocorrect safe_autocorrect autocorrect_all
cache fail_fast stdin parallel].freeze
DL_EXTENSIONS = ::RbConfig::CONFIG
.values_at('DLEXT', 'DLEXT2')
.reject { |ext| !ext || ext.empty? }
.map { |ext| ".#{ext}" }
.freeze
# Remove old files so that the cache doesn't grow too big. When the
# threshold MaxFilesInCache has been exceeded, the oldest 50% of all the
# files in the cache are removed. The reason for removing so much is that
# cleaning should be done relatively seldom, since there is a slight risk
# that some other RuboCop process was just about to read the file, when
# there's parallel execution and the cache is shared.
def self.cleanup(config_store, verbose, cache_root = nil)
return if inhibit_cleanup # OPTIMIZE: For faster testing
cache_root ||= cache_root(config_store)
return unless File.exist?(cache_root)
files, dirs = Find.find(cache_root).partition { |path| File.file?(path) }
return unless requires_file_removal?(files.length, config_store)
remove_oldest_files(files, dirs, cache_root, verbose)
end
class << self
# @api private
attr_accessor :rubocop_required_features
ResultCache.rubocop_required_features = []
private
def requires_file_removal?(file_count, config_store)
file_count > 1 && file_count > config_store.for_pwd.for_all_cops['MaxFilesInCache']
end
def remove_oldest_files(files, dirs, cache_root, verbose)
# Add 1 to half the number of files, so that we remove the file if
# there's only 1 left.
remove_count = (files.length / 2) + 1
puts "Removing the #{remove_count} oldest files from #{cache_root}" if verbose
sorted = files.sort_by { |path| File.mtime(path) }
remove_files(sorted, dirs, remove_count)
rescue Errno::ENOENT
# This can happen if parallel RuboCop invocations try to remove the
# same files. No problem.
puts $ERROR_INFO if verbose
end
def remove_files(files, dirs, remove_count)
# Batch file deletions, deleting over 130,000+ files will crash
# File.delete.
files[0, remove_count].each_slice(10_000).each do |files_slice|
File.delete(*files_slice)
end
dirs.each { |dir| Dir.rmdir(dir) if Dir["#{dir}/*"].empty? }
end
end
def self.cache_root(config_store)
CacheConfig.root_dir do
config_store.for_pwd.for_all_cops['CacheRootDirectory']
end
end
def self.allow_symlinks_in_cache_location?(config_store)
config_store.for_pwd.for_all_cops['AllowSymlinksInCacheRootDirectory']
end
attr_reader :path
def initialize(file, team, options, config_store, cache_root = nil)
cache_root ||= File.join(options[:cache_root], 'rubocop_cache') if options[:cache_root]
cache_root ||= ResultCache.cache_root(config_store)
@allow_symlinks_in_cache_location =
ResultCache.allow_symlinks_in_cache_location?(config_store)
@path = File.join(cache_root,
rubocop_checksum,
context_checksum(team, options),
file_checksum(file, config_store))
@cached_data = CachedData.new(file)
@debug = options[:debug]
end
def debug?
@debug
end
def valid?
File.exist?(@path)
end
def load
puts "Loading cache from #{@path}" if debug?
@cached_data.from_json(File.read(@path, encoding: Encoding::UTF_8))
end
def save(offenses)
dir = File.dirname(@path)
begin
FileUtils.mkdir_p(dir)
rescue Errno::EACCES, Errno::EROFS => e
warn "Couldn't create cache directory. Continuing without cache.\n #{e.message}"
return
end
preliminary_path = "#{@path}_#{rand(1_000_000_000)}"
# RuboCop must be in control of where its cached data is stored. A
# symbolic link anywhere in the cache directory tree can be an
# indication that a symlink attack is being waged.
return if symlink_protection_triggered?(dir)
File.open(preliminary_path, 'w', encoding: Encoding::UTF_8) do |f|
f.write(@cached_data.to_json(offenses))
end
# The preliminary path is used so that if there are multiple RuboCop
# processes trying to save data for the same inspected file
# simultaneously, the only problem we run in to is a competition who gets
# to write to the final file. The contents are the same, so no corruption
# of data should occur.
FileUtils.mv(preliminary_path, @path)
end
private
def symlink_protection_triggered?(path)
!@allow_symlinks_in_cache_location && any_symlink?(path)
end
def any_symlink?(path)
while path != File.dirname(path)
if File.symlink?(path)
warn "Warning: #{path} is a symlink, which is not allowed."
return true
end
path = File.dirname(path)
end
false
end
def file_checksum(file, config_store)
digester = Digest::SHA1.new
mode = File.stat(file).mode
digester.update("#{file}#{mode}#{config_store.for_file(file).signature}")
digester.file(file)
digester.hexdigest
rescue Errno::ENOENT
# Spurious files that come and go should not cause a crash, at least not
# here.
'_'
end
class << self
attr_accessor :source_checksum, :inhibit_cleanup
end
# The checksum of the RuboCop program running the inspection.
def rubocop_checksum
ResultCache.source_checksum ||=
begin
digest = Digest::SHA1.new
rubocop_extra_features
.select { |path| File.file?(path) }
.sort!
.each do |path|
digest << digest(path)
end
digest << RuboCop::Version::STRING << RuboCop::AST::Version::STRING
digest.hexdigest
end
end
def digest(path)
content = if path.end_with?(*DL_EXTENSIONS)
# Shared libraries often contain timestamps of when
# they were compiled and other non-stable data.
File.basename(path)
else
File.binread(path) # mtime not reliable
end
Zlib.crc32(content).to_s
end
def rubocop_extra_features
lib_root = File.join(File.dirname(__FILE__), '..')
exe_root = File.join(lib_root, '..', 'exe')
# Make sure to use an absolute path to prevent errors on Windows
# when traversing the relative paths with symlinks.
exe_root = File.absolute_path(exe_root)
# These are all the files we have `require`d plus everything in the
# exe directory. A change to any of them could affect the cop output
# so we include them in the cache hash.
source_files = $LOADED_FEATURES + Find.find(exe_root).to_a
source_files -= ResultCache.rubocop_required_features # Rely on gem versions
source_files
end
# Return a hash of the options given at invocation, minus the ones that have
# no effect on which offenses and disabled line ranges are found, and thus
# don't affect caching.
def relevant_options_digest(options)
options = options.reject { |key, _| NON_CHANGING.include?(key) }
options.to_s.gsub(/[^a-z]+/i, '_')
end
# The external dependency checksums are cached per RuboCop team so that
# the checksums don't need to be recomputed for each file.
def team_checksum(team)
@checksum_by_team ||= {}.compare_by_identity
@checksum_by_team[team] ||= team.external_dependency_checksum
end
# We combine team and options into a single "context" checksum to avoid
# making file names that are too long for some filesystems to handle.
# This context is for anything that's not (1) the RuboCop executable
# checksum or (2) the inspected file checksum.
def context_checksum(team, options)
Digest::SHA1.hexdigest([team_checksum(team), relevant_options_digest(options)].join)
end
end
end