#!/usr/bin/env python3

"""
dir_compare.py
Compare files in two (or more) directories
-John Taylor

written with Python 3.4.3 on Windows 7
tested on Windows 8.1, OS X 10.10.5 and Ubuntu Linux 14.04

debug reminder: import pdb; pdb.set_trace()

todo
------
-x => only compare files with a colon-delimited list of extensions
      example: -x .m:.c:.h:.txt
-1 => only show files exclusive to d1
-2 => only show files exclusive to d2
improve -s (stats) accuracy
"""

import os, os.path, time, re, argparse, shutil, timeit, operator
import sys, platform, stat
import filecmp
import difflib
from datetime import datetime
from itertools import zip_longest

# displayed when running dir_compare.py -h
pgm_version = "4.3"
pgm_date = "Nov-2-2017 09:39"

##########################################################################################################

# makes it easy to copy/paste a cmd line to compare two non-identical (but similar) files
# this value is set by the get_default_cmp_pgm() functions
want_cmp_pgm = True
str_cmp_pgm = None

# hard-coded:start

# skip unimportant files - these will not show up in any output, as they are completely skipped
# these are regular expressions
want_regexpr_skip_filelist = True
regexpr_skip_filelist = ( r"~$", r"\.swp$", r"^~\$", r"\.tmp$", "Thumbs.db", r"\.lnk$", "Destinations-ms$" )
compiled_regexpr_skip_filelist = []

# the above want_regexpr_skip_filelist is only used in print_diff()
# if you want to exclude these files from print_same(), print_exclusive_d1(), print_exclusive_d2()
# then set this to True (the default is False)
want_global_skip_filelist = False

# skip unimportant directories - these will not show up in any output, as they are completely skipped
# these are regular expressions
want_regexpr_skip_dirlist = True
regexpr_skip_dirlist = ( "cache", "recent", "cookies", r"\$recycle.bin" )
compiled_regexpr_skip_dirlist = []

# if want_cmp_pgm is true and cmp_min_ratio > 0, then only show the str_cmp_pgm line for files with
# ratios >= cmp_min_ratio
# in other words do not show a str_cmp_pgm for files that are not anywhere similar
# a number closer to 100 means two files are very similar
###cmp_min_ratio = 70.00
# The similarity ratio is not computed by this version, so the filter is
# switched off: 0 means "show the str_cmp_pgm line for every differing file".
cmp_min_ratio = 0.00

# skip difference computation for certain file extensions
want_skip_diff_list = True
skip_diff_list = (".pst", ".tmp" )

# some types of files are VERY slow when using the standard difflib ratio() function to compute file similarity
# therefore use the real_quick_ratio() function instead
want_real_quick_diff_list = True
real_quick_diff_list = (".min.css", ".min.js" )

# do not run similarity computation on file greater than this size
# 1 MB = 1 * (1024 * 1024)
# default is 20 MB
###want_file_size_diff_limit = True
###file_size_diff_limit = 20 * ( 1024 * 1024 )

# hard-coded:end

# verbose: print directory names to STDERR
# controlled by -v cmd line option
# Example: dir_compare.py -r alpha beta > output.log
# this will display (to the terminal) each directory group as they are being compared
want_verbose_dir_print = False

# if False, file contents will be compared, not just their metadata
# controlled by -c cmd line option
shallow_cmp = True

# only show files that have the same file size and timestamp
# controlled by -i cmd line option
only_show_same = False

# statistics summary
# controlled by -s cmd line option
count_same_files = 0
count_diff_files = 0
count_unequal_files = 0
count_same_contents_files = 0
count_exclusive_d1 = 0
count_exclusive_d2 = 0
skipped_files = []
skipped_directories = []

# time how long it takes for SequenceMatcher.ratio() to run
# displayed after all files have been compared when using the -S option
elapsed_ratio_time = {}

# output difference to html files
# controlled by -H cmd line option
html_output_dir = None

# Output format type
# controlled by -t cmd line option
tab_file = None
ofmt_delim = "\t"
date_time_fmt = "%a %b %d %H:%M:%S %Y"

# example tab-file columns:
#   comparison_value,dname1,dname2,fname,fsize1,fsize2,fsize_diff,date1,date2,date_diff
# comparison values written by this script:
#   samemeta, samemeta_diffdata, same_content, identical, different, exclusive_d1, exclusive_d2
# "identical" only occurs with -c (compare contents & metadata)

##########################################################################################################

# characters regarded as "text"; anything left after deleting them marks the data as binary
textchars = bytearray([0,7,8,9,10,12,13,27]) + bytearray(range(0x20, 0x100))
# str.translate() table that deletes every text character
textdict = dict.fromkeys(textchars, '')

def is_binary_string(data):
    """Return True when data is empty or contains characters outside textchars.

    data is expected to be a str (one line of a file opened in text mode).
    """
    if not len(data):
        return True
    return bool(data.translate(textdict))

##########################################################################################################

def file_cmp_shallow(fname1:str, fname2:str) -> bool:
    """Compare mode, size and mod time

    Returns True when both files have the same st_mode, st_size and st_mtime.
    Raises OSError when either file cannot be stat'ed.
    """
    st1 = os.stat(fname1)
    st2 = os.stat(fname2)
    return (st1.st_mode, st1.st_size, st1.st_mtime) == (st2.st_mode, st2.st_size, st2.st_mtime)

#############################################################################

def file_cmp_exact(fname1:str, fname2:str) -> bool:
    """Compare two files byte for byte

    Returns True when the contents are identical.
    Raises OSError when either file cannot be opened or read.
    """
    bufsize = 1024*1024
    with open(fname1, 'rb') as fp1, open(fname2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            if not b1:
                # both files hit EOF in the same read without a mismatch
                return True

#############################################################################

def get_default_cmp_pgm():
    """Set the global str_cmp_pgm to the first comparison program found.

    The candidate list depends on platform.system().  When the program is
    reachable through the shell's PATH only its base name is kept, and a
    value containing a space is wrapped in double quotes.  str_cmp_pgm is
    left untouched when no candidate exists.
    """
    global str_cmp_pgm

    system = platform.system()
    if "Windows" == system:
        candidates = ( r"C:\Program Files\WinMerge\WinMergeU.exe", r"C:\Program Files (x86)\WinMerge\WinMergeU.exe", r"c:\Program Files\KDiff3\kdiff3.exe", r"c:\Program Files (x86)\KDiff3\kdiff3.exe" )
    elif "Linux" == system:
        candidates = ( "/usr/bin/kdiff3", "/usr/bin/diff" )
    elif "Darwin" == system:
        candidates = ( "/usr/bin/opendiff", "/Applications/kdiff3.app/Contents/MacOS/kdiff3", "/usr/bin/diff" )
    else:
        candidates = ( "/usr/bin/diff", "/usr/local/bin/diff" )

    # one search loop for every platform (the fallback list used to be ignored)
    for c in candidates:
        if os.path.exists(c):
            str_cmp_pgm = c
            break

    # instead of displaying the full path, only display the pgm name if it resides on the OS shell's path
    if str_cmp_pgm:
        base = os.path.basename(str_cmp_pgm)
        if shutil.which(base):
            str_cmp_pgm = base
        if str_cmp_pgm.find(" ") >= 0:
            str_cmp_pgm = '"%s"' % (str_cmp_pgm)

##########################################################################################################

def _compile_skip_lists():
    """Compile the skip-list regular expressions once; later calls are no-ops."""
    if want_regexpr_skip_filelist and not compiled_regexpr_skip_filelist:
        compiled_regexpr_skip_filelist.extend(re.compile(r, re.I) for r in regexpr_skip_filelist)
    if want_regexpr_skip_dirlist and not compiled_regexpr_skip_dirlist:
        compiled_regexpr_skip_dirlist.extend(re.compile(r, re.I) for r in regexpr_skip_dirlist)

def _exit_with_message(msg):
    """Print msg surrounded by blank lines and terminate with exit status 1."""
    safe_print()
    safe_print()
    safe_print(msg)
    safe_print()
    sys.exit(1)

def _print_dir_banner(d1, d2, dest):
    """Print the 'directory 1 / directory 2' banner to the stream dest."""
    for i in range(0,4):
        safe_print("", outfile=dest)
    safe_print("=" * 135, outfile=dest)
    safe_print("directory 1: %s" % (d1), outfile=dest)
    safe_print("directory 2: %s" % (d2), outfile=dest)
    safe_print("=" * 135, outfile=dest)
    safe_print(outfile=dest)

def process_directories(d1,d2,diff_only=False,recurse=False):
    """Compare the files directly inside d1 and d2 and print the report.

    d1, d2    -- directory paths to compare
    diff_only -- only report differing files
    recurse   -- True when called as part of a recursive walk; adds a
                 banner naming both directories around the report

    A directory matching the directory skip list is recorded in
    skipped_directories and not compared.  The skip lists are compiled the
    first time execution reaches the comparison, i.e. after the skip test,
    so the first pair of directories processed is never skipped.
    Exits with status 1 when a path is missing, is not a directory, or when
    both paths are the same string.
    """
    abort = False
    if want_regexpr_skip_dirlist and within_regexpr_skip_dirlist(d1):
        abort = d1
    elif want_regexpr_skip_dirlist and within_regexpr_skip_dirlist(d2):
        abort = d2

    if abort:
        skipped_directories.append(abort)
        safe_print()
        safe_print("Directory excluded by reg expr skip list: %s" % (abort))
        safe_print()
        return

    if not os.path.exists(d1):
        _exit_with_message("Directory path does not exist: %s" % (d1))
    if not os.path.exists(d2):
        _exit_with_message("Directory path does not exist: %s" % (d2))
    if not os.path.isdir(d1):
        _exit_with_message("Not a directory: %s" % (d1))
    if not os.path.isdir(d2):
        _exit_with_message("Not a directory: %s" % (d2))
    if d1 == d2:
        _exit_with_message("Identical directories given as parameters.")

    _compile_skip_lists()

    meta = filecmp.dircmp(d1, d2)

    if recurse:
        _print_dir_banner(d1, d2, sys.stdout)

    if want_verbose_dir_print:
        _print_dir_banner(d1, d2, sys.stderr)

    if diff_only or not only_show_same:
        same_contents = print_differ(meta,d1,d2)
        if same_contents:
            print_same_contents(same_contents)

    if not diff_only:
        unequal = print_same(meta,d1,d2)
        if unequal:
            print_unequal( unequal )
        if not only_show_same:
            print_exclusive(meta,"d1",d1)
            print_exclusive(meta,"d2",d2)

    if recurse:
        for i in range(0,4):
            safe_print()

##########################################################################################################

def safe_print(data="",outfile=None):
    """Print data to outfile, dropping characters the stream cannot encode.

    outfile defaults to the current sys.stdout, looked up at call time so
    that a redirected stdout is honoured.  When the stream reports no
    encoding, utf-8 is assumed.
    """
    if outfile is None:
        outfile = sys.stdout
    encoding = getattr(outfile, "encoding", None) or getattr(sys.stdout, "encoding", None) or "utf-8"
    # can also use 'replace' instead of 'ignore' for errors= parameter
    print( str(data).encode(encoding, errors='ignore').decode(encoding), file=outfile )

##########################################################################################################

def find_common(a0,b0):
    """Return the trailing part that two paths share, cut at an os.sep.

    Example (with "/" as separator):
        find_common("/x/alpha/sub/f.txt", "/x/beta/sub/f.txt") -> "sub/f.txt"

    When the parts found for the two paths differ, the base name of the
    first one is returned instead.
    """
    a = a0[::-1]
    b = b0[::-1]

    # i: first prefix length at which the reversed paths disagree
    # (or len(a)-1 when they never do)
    i = 0
    for i in range(0,len(a)):
        if a[0:i] != b[0:i]:
            break

    # walk back from i to a position holding a separator in both paths;
    # the length test keeps b[j] in range when b is the shorter string
    j = 0
    for j in range(i,0,-1):
        if a[j] == os.sep and j < len(b) and b[j] == os.sep:
            break

    new_a = a[:j][::-1]
    new_b = b[:j][::-1]

    if new_a != new_b:
        return os.path.basename(new_a)
    return new_a

##########################################################################################################

def within_regexpr_skip_filelist(fname):
    """Return True when fname matches any compiled file skip expression."""
    # set dbg to a filename fragment (or an entire filename) to debug which files are being skipped
    dbg = False

    for r in compiled_regexpr_skip_filelist:
        match = r.findall(fname)
        if len(match):
            if dbg:
                safe_print(":: fname skip positive: %s %s" % (fname, match))
            return True
        if dbg:
            safe_print(":: fname skip negative: %s %s" % (fname, match))
    return False

##########################################################################################################

def within_regexpr_skip_dirlist(dname):
    """Return True when dname matches any compiled directory skip expression."""
    return any(r.search(dname) for r in compiled_regexpr_skip_dirlist)

##########################################################################################################

def print_listing(meta,root=True):
    """Print one command line per directory pair found below meta.

    meta is a filecmp.dircmp object; root is True for the top-level call,
    which also prints the pair held by meta itself.
    """
    pgm = os.path.basename(sys.argv[0])
    if root:
        safe_print('%s "%s" "%s"' % (pgm, meta.left, meta.right))

    for key in sorted(meta.subdirs.keys()):
        sub = meta.subdirs[key]
        safe_print('%s "%s" "%s"' % (pgm, sub.left, sub.right))
        print_listing( filecmp.dircmp(sub.left,sub.right), False )

##########################################################################################################

def recurse_directories(meta, diff_only):
    """Run process_directories() on every common sub-directory pair, depth first.

    meta is a filecmp.dircmp object for the parent pair.
    """
    for key in sorted(meta.subdirs.keys()):
        sub = meta.subdirs[key]
        process_directories( sub.left, sub.right, diff_only=diff_only, recurse=True )
        recurse_directories( filecmp.dircmp(sub.left,sub.right), diff_only )

##########################################################################################################

def _report_pair(file1, file2, comparison):
    """Print one size/date row per file and, with -t, save a tab-file record."""
    a = os.stat(file1)
    b = os.stat(file2)
    g = time.asctime(time.localtime(a.st_mtime))
    h = time.asctime(time.localtime(b.st_mtime))

    safe_print("%67s %10s %24s" % (make_ellipses(file1,67), a.st_size, g))
    # second row reports the second file's own size
    safe_print("%67s %10s %24s" % (make_ellipses(file2,67), b.st_size, h))

    if tab_file:
        basename1 = os.path.basename(file1)
        basename2 = os.path.basename(file2)
        basename = basename1 if basename1 == basename2 else "????"
        save_tab_file(comparison,os.path.dirname(file1),os.path.dirname(file2),basename,a.st_size,b.st_size,g,h)

def print_unequal(unequal):
    """Report file pairs whose metadata matches but whose contents differ.

    unequal is a list of (file1, file2) path tuples; each pair increments
    count_unequal_files.
    """
    global count_unequal_files

    safe_print()
    safe_print("-" * 135)
    safe_print(" " * 40 + "files contents are unequal, but metatdata is the same")
    safe_print("-" * 135)
    safe_print()
    safe_print("%67s %10s %24s" % ("fname", "size", "date"))
    safe_print("%67s %10s %24s" % ("="*33, "="*10, "="*24))

    for grp in unequal:
        count_unequal_files += 1
        _report_pair(grp[0], grp[1], "samemeta_diffdata")
        # dotted separator only when there is more than one pair to tell apart
        if len(unequal) > 1:
            safe_print("%67s %10s %24s" % ("."*33, "."*9,"."*24))

    safe_print()
    safe_print()

##########################################################################################################

def print_same_contents(same):
    """Report file pairs whose contents are identical although they were listed as differing.

    same is a list of (file1, file2) path tuples; each pair increments
    count_same_contents_files.
    """
    global count_same_contents_files

    safe_print()
    safe_print("-" * 135)
    safe_print(" " * 40 + "files contents are identical, metadata may be different")
    safe_print("-" * 135)
    safe_print()
    safe_print("%67s %10s %24s" % ("fname", "size", "date"))
    safe_print("%67s %10s %24s" % ("="*33, "="*10, "="*24))

    for grp in same:
        count_same_contents_files += 1
        _report_pair(grp[0], grp[1], "same_content")
        safe_print("%67s %10s %24s" % ("."*33, "."*9,"."*24))

    safe_print()

##########################################################################################################

def _write_html_diff(file1, file2):
    """Write an HTML side-by-side diff of two text files below html_output_dir.

    Nothing is written when either file is unreadable, empty, or looks binary.
    """
    try:
        with open(file1,"r") as fp:
            file1_data = fp.readlines()
        with open(file2,"r") as fp:
            file2_data = fp.readlines()
    except (OSError, UnicodeError):
        # unreadable or undecodable in the default text encoding: no diff
        return

    if not len(file1_data) or not len(file2_data):
        return
    # binary heuristic: inspect the first and the next-to-last line of file 1
    if is_binary_string(file1_data[0]) or is_binary_string(file1_data[len(file1_data)-2]):
        return

    dest = sys.stderr
    html = difflib.HtmlDiff(tabsize=4,wrapcolumn=65)
    try:
        diff = html.make_file(file1_data,file2_data,"dir 1","dir 2",True)
    except RuntimeError as err:
        safe_print("Error #5395 - unable to create HTML diff file between:", outfile=dest)
        safe_print(" file1: %s" % (file1), outfile=dest)
        safe_print(" file2: %s" % (file2), outfile=dest)
        safe_print(err,outfile=dest)
        safe_print("",outfile=dest)
        return

    common_name = find_common(file1,file2)
    html_fname = "%s%s%s.html" % (html_output_dir,os.sep,common_name)
    rootdir = os.path.dirname(html_fname)
    try:
        os.makedirs(rootdir,mode=0o777,exist_ok=True)
    except OSError as err:
        safe_print("Error #4602 - error while creating directory: %s" % (html_output_dir), outfile=dest)
        safe_print(err,outfile=dest)
        safe_print("",outfile=dest)
        return

    with open(html_fname,mode="w") as fp:
        fp.write(diff)

def print_differ(meta,d1,d2):
    """Report the files present in both directories that differ.

    meta   -- filecmp.dircmp object for (d1, d2)
    d1, d2 -- the two directory paths

    A star marks the larger / newer file of each pair.  Files matching the
    file skip list are recorded in skipped_files and not reported.  With -c
    a pair whose contents turn out to be identical is not printed here but
    collected for the caller.  With -H an HTML diff is written per pair, and
    with -t a "different" record is appended to the tab file.

    Returns a list of (file1, file2) tuples with identical contents, False
    when that list is empty, or None when there are no differing files.
    """
    global count_diff_files

    if not len(meta.diff_files):
        safe_print()
        safe_print("-" * 135)
        safe_print(" " * 40 + "there are no differing files")
        safe_print("-" * 135)
        return

    for i in range(0,6):
        safe_print()
    safe_print("-" * 135)
    if shallow_cmp:
        safe_print(" " * 40 + "files that differ [%s] (star denotes newer or larger file; higher ratio denotes more similarity)" % len(meta.diff_files))
    else:
        safe_print(" " * 40 + "files that differ (star denotes newer or larger file; higher ratio denotes more similarity)")
    safe_print("-" * 135)
    safe_print()
    safe_print("%67s %10s %10s %24s %24s %6s" % ("fname", "size-1", "size-2", "date-1", "date-2","ratio"))
    safe_print("%67s %10s %10s %24s %24s %6s" % ("="*33, "="*10, "="*10, "="*24, "="*24, "="*6))

    cmp_results = []
    actually_same_contents = []
    files_processed = 0

    for f in sorted(meta.diff_files):
        file1 = "%s%s%s" % (d1,os.sep,f)
        file2 = "%s%s%s" % (d2,os.sep,f)

        if want_regexpr_skip_filelist and within_regexpr_skip_filelist(f):
            skipped_files.append(file1)
            continue

        count_diff_files += 1
        try:
            a = os.stat(file1)
            b = os.stat(file2)
        except OSError as err:
            dest=sys.stderr
            safe_print("Error #6821 - error while processing file in print_differ()", outfile=dest)
            safe_print(err,outfile=dest)
            safe_print("",outfile=dest)
            continue

        # star markers: x/y flag the larger file, j/k the newer one
        x = "*" if a.st_size > b.st_size else " "
        y = "*" if a.st_size < b.st_size else " "
        j = "*" if a.st_mtime > b.st_mtime else " "
        k = "*" if a.st_mtime < b.st_mtime else " "
        g = time.asctime(time.localtime(a.st_mtime))
        h = time.asctime(time.localtime(b.st_mtime))

        files_processed += 1
        if not shallow_cmp:
            # -c: decide on the bytes, not on the metadata
            if file_cmp_exact(file1,file2):
                actually_same_contents.append( (file1,file2) )
                continue

        safe_print("%67s %10s%s %10s%s %24s%s %24s%s" % (make_ellipses(f,67), a.st_size, x, b.st_size, y, g,j, h,k))

        if tab_file:
            save_tab_file("different",d1,d2,f,a.st_size,b.st_size,g,h)

        # no similarity ratio is computed, so every differing pair gets a command line
        if want_cmp_pgm and str_cmp_pgm:
            entry = '%s "%s%s%s" "%s%s%s"' % (str_cmp_pgm,d1,os.sep,f,d2,os.sep,f)
            cmp_results.append(entry)

        if html_output_dir:
            _write_html_diff(file1, file2)

    if not files_processed:
        safe_print("%67s" % ("All files were excluded by the file skip list regular expression."))

    if want_cmp_pgm and len(cmp_results):
        safe_print()
        safe_print("-" * 135)
        safe_print(" " * 40 + "command-line file compare (ratio >= %4.2f%%)" % (cmp_min_ratio))
        safe_print("-" * 135)
        safe_print()
        for entry in cmp_results:
            safe_print(entry)

    return actually_same_contents if len(actually_same_contents) else False

##########################################################################################################

def print_same(meta,d1,d2):
    """Report the files that filecmp regards as the same in both directories.

    meta   -- filecmp.dircmp object for (d1, d2)
    d1, d2 -- the two directory paths

    With -c each pair is additionally compared byte for byte; pairs that
    fail that test are not printed here but collected for the caller.

    Returns a list of (file1, file2) tuples whose contents differ, False
    when that list is empty, or None when there are no matching files.
    """
    global count_same_files

    actually_different = []

    if not len(meta.same_files):
        safe_print()
        safe_print("-" * 135)
        safe_print(" " * 40 + "there are no matching files")
        safe_print("-" * 135)
        return

    for i in range(0,4):
        safe_print()
    safe_print("-" * 135)
    safe_print(" " * 37 + "files that are the same [%s]" % len(meta.same_files))
    safe_print("-" * 135)
    safe_print("%67s %10s %24s" % ("fname", "size", "date"))
    safe_print("%67s %10s %24s" % ("="*33, "="*10, "="*24))

    for f in sorted(meta.same_files):
        f1 = "%s%s%s" % (d1,os.sep,f)
        f2 = "%s%s%s" % (d2,os.sep,f)

        if want_regexpr_skip_filelist and want_global_skip_filelist and within_regexpr_skip_filelist(f):
            skipped_files.append(f1)
            continue

        count_same_files += 1
        if not shallow_cmp:
            if not file_cmp_exact(f1,f2):
                actually_different.append( (f1,f2) )
                continue

        try:
            a = os.stat(f1)
            b = os.stat(f2)
        except OSError as err:
            dest=sys.stderr
            safe_print("Error #9724 - error while processing file in print_same()", outfile=dest)
            safe_print(err,outfile=dest)
            safe_print("",outfile=dest)
            # size and date are unknown: show placeholders, write no tab record
            safe_print("%67s %10s %24s" % (make_ellipses(f,67), "????", "????"))
            continue

        q = time.asctime(time.localtime(b.st_mtime))
        safe_print("%67s %10s %24s" % (make_ellipses(f,67), a.st_size, q))

        if tab_file:
            comparison_value = "samemeta" if shallow_cmp else "identical"
            save_tab_file(comparison_value,d1,d2,f,a.st_size,b.st_size,q,q)

    safe_print()
    safe_print()
    return actually_different if len(actually_different) else False

##########################################################################################################

def print_exclusive(meta,dname,d0):
    """Report the entries that exist in only one of the two directories.

    meta  -- filecmp.dircmp object
    dname -- "d1" selects meta.left_only, anything else meta.right_only
    d0    -- path of the directory that owns those entries
    """
    global count_exclusive_d1, count_exclusive_d2

    is_d1 = "d1" == dname
    metadir = meta.left_only if is_d1 else meta.right_only

    if not len(metadir):
        safe_print()
        safe_print("-" * 135)
        safe_print(" " * 30 + "there are no files exclusively in: %s" % (d0))
        safe_print("-" * 135)
        return

    for i in range(0,4):
        safe_print()
    safe_print("-" * 135)
    safe_print(" " * 30 + "files exclusively in [%s]: %s" % (len(metadir),d0))
    safe_print("-" * 135)
    safe_print("%67s %10s %24s" % ("fname", "size", "date"))
    safe_print("%67s %10s %24s" % ("="*33, "="*10, "="*24))

    for f in sorted(metadir):
        full_name = "%s%s%s" % (d0,os.sep,f)
        if want_regexpr_skip_filelist and want_global_skip_filelist and within_regexpr_skip_filelist(f):
            skipped_files.append(full_name)
            continue

        if is_d1:
            count_exclusive_d1 += 1
        else:
            count_exclusive_d2 += 1

        a = os.stat(full_name)
        q = time.asctime(time.localtime(a.st_mtime))
        safe_print("%67s %10s %24s" % (make_ellipses(f,67), a.st_size, q))

        if tab_file:
            # the owning directory always goes into the first directory column
            comparison = "exclusive_d1" if is_d1 else "exclusive_d2"
            save_tab_file(comparison,d0,"",f,a.st_size,"",q,"")

    safe_print()
    safe_print()

##########################################################################################################

def make_ellipses(fname, sz):
    """Shorten fname to about sz characters by replacing its middle with '...'.

    Names of at most sz characters are returned unchanged.
    """
    w = len(fname)
    if w <= sz:
        return fname

    # keep the same number of characters on both sides of the "..."
    segment = sz // 2 - 1
    return "%s...%s" % (fname[0:segment],fname[ (w-segment):])

##########################################################################################################

def file_in_skip_diff_list(f1,f2):
    """Return True when both names end (case-insensitively) in one extension from skip_diff_list."""
    name1 = f1.lower()
    name2 = f2.lower()
    for tmp in skip_diff_list:
        ext = tmp.lower()
        if name1.endswith(ext) and name2.endswith(ext):
            return True
    return False

##########################################################################################################

def print_totals(detailed=False, identical=False):
    """Print the statistics summary to STDERR.

    detailed  -- also list skipped files, skipped directories and any
                 recorded ratio timings
    identical -- label the first counter "identical" instead of
                 "same file metadata"
    """
    dest = sys.stderr

    for i in range(0,4):
        safe_print(outfile=dest)
    safe_print("=" * 135,outfile=dest)
    safe_print("%67s" % ("statistical totals"), outfile=dest)
    safe_print("=" * 135, outfile=dest)
    for i in range(0,2):
        safe_print(outfile=dest)

    desc = "identical" if identical else "same file metadata"
    safe_print("%40s %s" % ("%s:" % (desc), (count_same_files-count_unequal_files)), outfile=dest)
    safe_print("%40s %s" % ("different files:", count_diff_files), outfile=dest)
    safe_print("%40s %s" % ("same metadata, different data:", count_unequal_files), outfile=dest)
    safe_print("%40s %s" % ("same contents, possibly different metadata:", count_same_contents_files), outfile=dest)
    safe_print("%40s %s" % ("exclusive to directory 1:", count_exclusive_d1), outfile=dest)
    safe_print("%40s %s" % ("exclusive to directory 2:", count_exclusive_d2), outfile=dest)
    safe_print("%40s %s" % ("skipped files (via reg expr):", len(skipped_files)), outfile=dest)
    safe_print("%40s %s" % ("skipped directories (via reg expr):", len(skipped_directories)), outfile=dest)
    for i in range(0,2):
        safe_print(outfile=dest)

    if not detailed:
        return

    if len(skipped_files):
        safe_print("skipped files", outfile=dest)
        safe_print("="*13, outfile=dest)
        safe_print(skipped_files, outfile=dest)
        for i in range(0,2):
            safe_print(outfile=dest)

    if len(skipped_directories):
        safe_print("skipped directories", outfile=dest)
        safe_print("="*19, outfile=dest)
        safe_print(skipped_directories, outfile=dest)
        for i in range(0,2):
            safe_print(outfile=dest)

    if len(elapsed_ratio_time):
        safe_print("elapsed time for file comparison ratios", outfile=dest)
        safe_print("="*39, outfile=dest)
        sorted_elap = sorted(elapsed_ratio_time.items(), key=operator.itemgetter(1),reverse=True)
        for entry in sorted_elap:
            safe_print("[%05.2f] %s" % (entry[1], entry[0]), outfile=dest)
        for i in range(0,2):
            safe_print(outfile=dest)

##########################################################################################################

def init_tab_file(fname):
    """Create (or truncate) the tab-delimited output file and write its header.

    Sets the global tab_file to fname.  Exits with status 1 when the file
    cannot be opened for writing.
    """
    global tab_file

    tab_file = fname
    # one name per field written by save_tab_file(), in the same order
    header = ( "comparison", "dname1", "dname2", "fname", "fsize1", "fsize2", "fsize2 - fsize1", "date1", "date2", "date2 - date1 (d:h:m:s)" )

    try:
        with open(tab_file,mode="w",encoding="latin-1",errors="replace") as fp:
            entry = ofmt_delim.join(header)
            fp.write("%s\n" % (entry))
    except OSError as err:
        dest=sys.stderr
        safe_print("",outfile=dest)
        safe_print("Error #5086 - unable to open file for writing: %s" % (fname), outfile=dest)
        safe_print("",outfile=dest)
        safe_print(err,outfile=dest)
        safe_print("",outfile=dest)
        sys.exit(1)

##########################################################################################################

# output file format:
# comparision-type,dname1,dname2,fname,fsize1,fsize2,fsize_diff,date1,fdate2,date_diff

def save_tab_file(comparison, dname1, dname2, fname, fsize1, fsize2, date1, date2):
    """Append one record to the tab-delimited output file.

    comparison     -- comparison label (first column)
    dname1, dname2 -- directory names; an empty string means "not applicable"
    fname          -- file name
    fsize1, fsize2 -- file sizes
    date1, date2   -- modification times as strings in date_time_fmt

    The size and date difference columns are filled in only when both
    directory names are present; the date difference is formatted as
    [-]dddd:hh:mm:ss.
    """
    fsize_diff = ""
    date_diff = ""

    if len(dname1) and len(dname2):
        d1 = datetime.strptime(date1, date_time_fmt)
        d2 = datetime.strptime(date2, date_time_fmt)
        fsize_diff = fsize2 - fsize1
        op = "" if d2.timestamp() >= d1.timestamp() else "-"
        # http://stackoverflow.com/a/2119509/452281
        tdel = abs( (d2-d1) )
        days, hours, minutes = ( tdel.days, tdel.seconds//3600, (tdel.seconds//60)%60 )
        # http://stackoverflow.com/a/14190143/452281
        seconds = int( tdel.total_seconds() % 60 )
        date_diff = "%s%04d:%02d:%02d:%02d" % (op,days, hours, minutes, seconds)

    # errors="replace": a name outside latin-1 must not abort the whole run
    with open(tab_file,mode="a",encoding="latin-1",errors="replace") as fp:
        entry = ofmt_delim.join((comparison,dname1,dname2,fname, "%s" % (fsize1), "%s" % (fsize2), "%s" % (fsize_diff), date1, date2, date_diff))
        fp.write("%s\n" % (entry))

##########################################################################################################

def print_hard_coded(fname):
    """Print the hard-coded option section of the source file fname.

    The section is located through its start/end marker comments.  The
    pattern literal below contains both markers and therefore matches
    itself; it is the last match in the file and is dropped.
    """
    hard_re = re.compile("# hard-coded:start(.*?)# hard-coded:end",re.S|re.M)
    with open(fname,mode="r",encoding="latin-1") as fp:
        data=fp.read()

    match = hard_re.findall(data)
    if not match:
        return

    for line in match[:-1]:
        safe_print(line)

##########################################################################################################

def main():
    """Parse the command line, run the comparison and print optional totals.

    Returns 0, or None when only the hard-coded options were printed.
    """
    global want_verbose_dir_print, shallow_cmp, str_cmp_pgm, only_show_same, html_output_dir

    parser = argparse.ArgumentParser(description="Compare files in two directories", epilog="version: %s (%s)" % (pgm_version,pgm_date))
    parser.add_argument("dname1", help="first directory to compare")
    parser.add_argument("dname2", help="second directory to compare")

    group1 = parser.add_mutually_exclusive_group()
    group1.add_argument("-r", "--recurse", help="recusively view file differences in directories", action="store_true")
    group1.add_argument("-o", "--options", help="print hard-coded options & values", action="store_true")

    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument("-d", "--diffonly", help="only show files that are different", action="store_true")
    group2.add_argument("-i", "--identical", help="only show files that have the same metadata",action="store_true")
    group2.add_argument("-1", "--one", help="only show files exclusive to dname1",action="store_true")
    group2.add_argument("-2", "--two", help="only show files exclusive to dname2",action="store_true")

    parser.add_argument("-c", "--contents", help="compare contents of the files, not just metadata", action="store_true")
    parser.add_argument("-p", "--pgm", help="use PGM as your comparision program")
    parser.add_argument("-H", "--hdir", help="output differences to HTML files using HDIR directory")
    parser.add_argument("-t", "--tabfile", help="also save tab-delimited results to TABFILE file")
    parser.add_argument("-v", "--verbose", help="print directories being compared to STDERR", action="store_true")
    parser.add_argument("-s", "--stats", help="print statistical totals to STDERR", action="store_true")
    parser.add_argument("-S", "--morestats", help="print even more detailed statistical totals to STDERR", action="store_true")

    args = parser.parse_args()

    if args.options:
        return print_hard_coded(sys.argv[0])

    if args.verbose:
        want_verbose_dir_print = True

    if args.contents:
        shallow_cmp = False

    if args.identical:
        only_show_same = True

    if args.hdir:
        html_output_dir = args.hdir

    if args.pgm:
        str_cmp_pgm = args.pgm
        # quote a program path containing spaces unless the user already did
        if -1 == str_cmp_pgm.find('"') and str_cmp_pgm.find(" ") >= 0:
            str_cmp_pgm = '"%s"' % (str_cmp_pgm)
    elif want_cmp_pgm:
        get_default_cmp_pgm()

    if args.tabfile:
        init_tab_file(args.tabfile)

    if args.recurse:
        # top-level pair first, then every common sub-directory pair
        process_directories( args.dname1, args.dname2, diff_only=args.diffonly, recurse=True )
        meta = filecmp.dircmp(args.dname1, args.dname2)
        recurse_directories(meta, args.diffonly)
    else:
        process_directories( args.dname1, args.dname2, diff_only=args.diffonly, recurse=False )

    if args.stats:
        print_totals(False,args.contents)

    if args.morestats:
        print_totals(True,args.contents)

    return 0

##########################################################################################################

if __name__ == "__main__":
    try:
        rv = main()
    except KeyboardInterrupt:
        rv=130
    sys.exit(rv)

# End of Script