#!/usr/bin/ruby

require 'date'
require 'optparse'

# Dumps younger than this many days (the last 4 weeks) are always kept.
ALWAYS_KEEP_DAYS = 4 * 7

# Older dumps are bucketed by calendar month, and the earliest dump in each
# bucket is kept.
#
# @param date [Date]
# @return [Date] the first day of the month containing +date+
def bucket(date)
  Date.new(date.year, date.month, 1)
end

# A dump file found on disk, paired with the date parsed from its name.
Candidate = Struct.new(:filename, :date)

# Finds the dump files matching +glob+ and parses the date out of each path.
#
# @param glob [String] pattern handed to Dir.glob
# @param date_pattern [Regexp] its first capture group must yield the date
#   as six digits in "%y%m%d" form
# @return [Array<Candidate>]
# @raise [RuntimeError] if a matched file's path does not match +date_pattern+
#   (Date.strptime additionally raises when the capture is not a valid date)
def list_files(glob, date_pattern)
  # File.file? follows symlinks, so symlinks are excluded explicitly: only
  # real files are considered for deletion.
  real_files = Dir.glob(glob).select do |file|
    File.file?(file) && !File.symlink?(file)
  end

  real_files.map do |file|
    m = date_pattern.match(file)
    raise "Unable to extract date string from #{file.inspect}" if m.nil?

    Candidate.new(file, Date.strptime(m[1], "%y%m%d"))
  end
end

# Selects the candidates that may be deleted: everything at least
# ALWAYS_KEEP_DAYS old, except the earliest dump of each calendar month.
#
# @param today [Date] reference date used to compute each candidate's age
# @param candidates [Array<Candidate>]
# @return [Array<Candidate>] grouped by month in order of first appearance,
#   each group sorted by date
def deletion_candidates(today, candidates)
  expired = candidates.reject { |c| today - c.date < ALWAYS_KEEP_DAYS }

  expired.group_by { |c| bucket(c.date) }.flat_map do |_month, contents|
    # sort_by is not stable, so the filename breaks ties between candidates
    # sharing a date; this makes the surviving file deterministic.
    contents.sort_by { |c| [c.date, c.filename] }.drop(1)
  end
end

# Computes the full list of paths to delete for one family of dump files.
#
# @param glob [String] pattern locating the primary dump files
# @param date_pattern [Regexp] see #list_files
# @param today [Date] reference date for the retention rules
# @param expansions [Array<String>] strftime templates naming every file that
#   belongs to a dump of a given date; resolved in the directory of the
#   primary dump file
# @return [Array<String>] paths of existing files that should be deleted
def deletions(glob, date_pattern, today, expansions)
  candidates = list_files(glob, date_pattern)
  to_delete = deletion_candidates(today, candidates)

  expanded = to_delete.flat_map do |candidate|
    dir = File.dirname(candidate.filename)
    expansions.map { |e| "#{dir}/#{candidate.date.strftime(e)}" }
  end

  # Not every companion file exists for every date; keep only those that do.
  expanded.select { |e| File.exist?(e) }
end

# --dry-run reports what would be deleted without deleting anything;
# --debug prints each file and a summary.
dry_run = false
debug = false

OptionParser.new do |opt|
  opt.on('--dry-run') { dry_run = true }
  opt.on('--debug') { debug = true }
end.parse!
# Dump directories. The `<%= ... %>` tags are template placeholders,
# presumably replaced with real paths when this template is rendered.
xml_directory = "<%= node[:planet][:dump][:xml_directory] %>"
xml_history_directory = "<%= node[:planet][:dump][:xml_history_directory] %>"
pbf_directory = "<%= node[:planet][:dump][:pbf_directory] %>"
pbf_history_directory = "<%= node[:planet][:dump][:pbf_history_directory] %>"

today = Date.today
to_delete = []

# Each date pattern is matched against the full path returned by the glob, so
# the dots are escaped and the pattern is anchored to the end of the path.
# This way only the file's own name can supply the date, never a directory
# component that happens to look similar.

# XML planet dumps, with their changeset/discussion dumps and checksums.
to_delete += deletions(
  "#{xml_directory}/20??/planet-??????.osm.bz2",
  /planet-([0-9]{6})\.osm\.bz2\z/,
  today,
  ["changesets-%y%m%d.osm.bz2",
   "changesets-%y%m%d.osm.bz2.md5",
   "discussions-%y%m%d.osm.bz2",
   "discussions-%y%m%d.osm.bz2.md5",
   "planet-%y%m%d.osm.bz2",
   "planet-%y%m%d.osm.bz2.md5"]
)

# XML full-history dumps.
to_delete += deletions(
  "#{xml_history_directory}/20??/history-??????.osm.bz2",
  /history-([0-9]{6})\.osm\.bz2\z/,
  today,
  ["history-%y%m%d.osm.bz2",
   "history-%y%m%d.osm.bz2.md5"]
)

# PBF planet dumps (no per-year subdirectory in this glob).
to_delete += deletions(
  "#{pbf_directory}/planet-??????.osm.pbf",
  /planet-([0-9]{6})\.osm\.pbf\z/,
  today,
  ["planet-%y%m%d.osm.pbf",
   "planet-%y%m%d.osm.pbf.md5"]
)

# PBF full-history dumps.
to_delete += deletions(
  "#{pbf_history_directory}/history-??????.osm.pbf",
  /history-([0-9]{6})\.osm\.pbf\z/,
  today,
  ["history-%y%m%d.osm.pbf",
   "history-%y%m%d.osm.pbf.md5"]
)

total_size = 0
num_deleted = 0
cmd = dry_run ? "Would delete" : "Deleted"

to_delete.each do |file|
  # Read the size before deleting; it is unavailable afterwards.
  size = File.stat(file).size
  File.delete(file) unless dry_run
  puts "#{cmd} #{file.inspect}, #{size / 1_000_000} MB" if debug
  total_size += size
  num_deleted += 1
end

if debug
  puts "#{cmd} files of total size #{total_size / 1_000_000_000.0} GB"
  puts "#{cmd} #{num_deleted} files"
end