From e1d6490a6bfaaa959b975ea8f0532c131d8951bc Mon Sep 17 00:00:00 2001 From: sesas Date: Mon, 26 Mar 2012 17:04:35 -0700 Subject: [PATCH 1/9] modified UFDFF.py to work with Python3000. Signed-off-by: sesas --- UltraFastDuplicateFilesFinder.py | 89 +++++++++++++++++--------------- fileWalker.py | 3 ++ 2 files changed, 49 insertions(+), 43 deletions(-) create mode 100644 fileWalker.py diff --git a/UltraFastDuplicateFilesFinder.py b/UltraFastDuplicateFilesFinder.py index 8e7fe78..a7b479f 100644 --- a/UltraFastDuplicateFilesFinder.py +++ b/UltraFastDuplicateFilesFinder.py @@ -105,46 +105,49 @@ def humanize_size(size): if hsize > 0.5: return '%.2f %s' % (hsize, suffix) - -# we start here by checking all files -for filename in sys.stdin: - filename = filename.strip() - - check_file(filename) - totalfiles += 1 - totalsize += os.path.getsize(filename) - -# print the report -print '%10s %s' % ('size', 'filename') - -for h, f in hashlist.iteritems(): - if hashcount[h] < 2: - # present one time, skip - continue - - # reference file - refsize = os.path.getsize(f[0]) - refmd5 = get_file_hash(f[0]) - print '%10d %s' % (refsize, f[0]) - - - for filename in f[1:]: - # and its copies - size = os.path.getsize(filename) - md5 = get_file_hash(filename) - - status = ' ' - msg = '' - if md5 != refmd5: - status = '!' - msg = ' partial match only!' - - print '%10d %s %s%s' % (size, status, filename, msg) - dupsize += size - dupfiles += 1 - print - -# final summary -print '%d files checked (%s), %d duplicates (%s).' 
% ( - totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize)) - +def main(): + # we start here by checking all files + for filename in sys.stdin: + filename = filename.strip() + + check_file(filename) + totalfiles += 1 + totalsize += os.path.getsize(filename) + + # print the report + print( '%10s %s' % ('size', 'filename') ) + + for h, f in hashlist.iteritems(): + if hashcount[h] < 2: + # present one time, skip + continue + + # reference file + refsize = os.path.getsize(f[0]) + refmd5 = get_file_hash(f[0]) + print( '%10d %s' % (refsize, f[0])) + + + for filename in f[1:]: + # and its copies + size = os.path.getsize(filename) + md5 = get_file_hash(filename) + + status = ' ' + msg = '' + if md5 != refmd5: + status = '!' + msg = ' partial match only!' + + print( '%10d %s %s%s' % (size, status, filename, msg)) + dupsize += size + dupfiles += 1 + print() + + # final summary + print( '%d files checked (%s), %d duplicates (%s).' % ( + totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize))) + + +if __name__ == '__main__': + main() diff --git a/fileWalker.py b/fileWalker.py new file mode 100644 index 0000000..8a43d6c --- /dev/null +++ b/fileWalker.py @@ -0,0 +1,3 @@ +import os, sys +import UltraFastDuplicateFilesFinder as ff + From 5ec3b17390d938014e0baeb2e6b90afbad510967 Mon Sep 17 00:00:00 2001 From: sesas Date: Mon, 26 Mar 2012 17:24:18 -0700 Subject: [PATCH 2/9] added a test folder, dirName input and file walker creation. 
Signed-off-by: sesas --- fileWalker.py | 37 +++++++++++++++++++++++++++++++++++ test/README | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ test/README_2 | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ test/README_3 | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 test/README create mode 100644 test/README_2 create mode 100644 test/README_3 diff --git a/fileWalker.py b/fileWalker.py index 8a43d6c..28de5ca 100644 --- a/fileWalker.py +++ b/fileWalker.py @@ -1,3 +1,40 @@ import os, sys import UltraFastDuplicateFilesFinder as ff +testPath = os.path.join( os.path.curdir ) + + + + + + + +def getDirName(dirName=None): + if dirName: + if os.path.isdir( os.path.normpath( dirName )): + return dirName + if sys.argv[1:]: + out = sys.argv[1] + if os.path.isdir( os.path.normpath( out )): + return out + + while 1: + inp = input("which folder would you like to find the duplicates in?\n") + if not inp: # mainly for debugging + global testPath + inp = testPath + break + if os.path.isdir( os.path.normpath( inp )): + break + return inp + + +def main(dirName=None): + root = getDirName(dirName) + walker = os.walk(root) + + for j in walker: + print(j) + +if __name__ == '__main__': + main() diff --git a/test/README b/test/README new file mode 100644 index 0000000..07a6777 --- /dev/null +++ b/test/README @@ -0,0 +1,53 @@ +Ultra Fast Duplicate Files Finder +================================= + by Gautier Portet + + +Takes a list of file from stdin. +And print the duplicate ones. + + +example use: + + find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py + +to find duplicates in your home folder, all files more than 10MB. + +UltraFastDuplicateFilesFinder compares only the very beginning of the files. +Its sufficient for most uses, but use with caution. + +But this way is quite useful to detect duplicates within corrupted media files... + + +this is public domain. 
+ + + +------------------------------------------------------------------------------ +example run, took less than a second to answer... + + +gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py + size filename + 12467906 /home/gautier/Photos/pict4614.mov + 12467906 /home/gautier/Photos/Videos/PICT4614.MOV + + 13068570 /home/gautier/Photos/pict4588.mov + 13068570 /home/gautier/Photos/Videos/PICT4588.MOV + +[...] + + 20865498 /home/gautier/Photos/pict4695.mov + 20865498 /home/gautier/Photos/Videos/PICT4695.MOV + + 28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac + 28270824 /home/gautier/tmp/tsunami-1.flac + +136 files checked (22.75 GiB), 8 duplicates (153.45 MiB). + + + + + + + diff --git a/test/README_2 b/test/README_2 new file mode 100644 index 0000000..07a6777 --- /dev/null +++ b/test/README_2 @@ -0,0 +1,53 @@ +Ultra Fast Duplicate Files Finder +================================= + by Gautier Portet + + +Takes a list of file from stdin. +And print the duplicate ones. + + +example use: + + find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py + +to find duplicates in your home folder, all files more than 10MB. + +UltraFastDuplicateFilesFinder compares only the very beginning of the files. +Its sufficient for most uses, but use with caution. + +But this way is quite useful to detect duplicates within corrupted media files... + + +this is public domain. + + + +------------------------------------------------------------------------------ +example run, took less than a second to answer... + + +gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py + size filename + 12467906 /home/gautier/Photos/pict4614.mov + 12467906 /home/gautier/Photos/Videos/PICT4614.MOV + + 13068570 /home/gautier/Photos/pict4588.mov + 13068570 /home/gautier/Photos/Videos/PICT4588.MOV + +[...] 
+ + 20865498 /home/gautier/Photos/pict4695.mov + 20865498 /home/gautier/Photos/Videos/PICT4695.MOV + + 28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac + 28270824 /home/gautier/tmp/tsunami-1.flac + +136 files checked (22.75 GiB), 8 duplicates (153.45 MiB). + + + + + + + diff --git a/test/README_3 b/test/README_3 new file mode 100644 index 0000000..07a6777 --- /dev/null +++ b/test/README_3 @@ -0,0 +1,53 @@ +Ultra Fast Duplicate Files Finder +================================= + by Gautier Portet + + +Takes a list of file from stdin. +And print the duplicate ones. + + +example use: + + find ~/ -size +10M | ./UltraFastDuplicateFilesFinder.py + +to find duplicates in your home folder, all files more than 10MB. + +UltraFastDuplicateFilesFinder compares only the very beginning of the files. +Its sufficient for most uses, but use with caution. + +But this way is quite useful to detect duplicates within corrupted media files... + + +this is public domain. + + + +------------------------------------------------------------------------------ +example run, took less than a second to answer... + + +gautier@quad:~/code/tmp$ find /home -size +10M | ./duplicate.py + size filename + 12467906 /home/gautier/Photos/pict4614.mov + 12467906 /home/gautier/Photos/Videos/PICT4614.MOV + + 13068570 /home/gautier/Photos/pict4588.mov + 13068570 /home/gautier/Photos/Videos/PICT4588.MOV + +[...] + + 20865498 /home/gautier/Photos/pict4695.mov + 20865498 /home/gautier/Photos/Videos/PICT4695.MOV + + 28270824 /home/gautier/tmp/tsunami 1 œ ǒǑ.flac + 28270824 /home/gautier/tmp/tsunami-1.flac + +136 files checked (22.75 GiB), 8 duplicates (153.45 MiB). + + + + + + + From 10e5cd893d181a4c8f34d4f822c28a9deaeff0df Mon Sep 17 00:00:00 2001 From: sesas Date: Tue, 27 Mar 2012 18:13:10 -0700 Subject: [PATCH 3/9] made the new module interface nicely with the original module to get the hashlist back and made optional filter for hidden folders. 
Signed-off-by: sesas --- UltraFastDuplicateFilesFinder.py | 11 ++++++----- fileWalker.py | 34 ++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/UltraFastDuplicateFilesFinder.py b/UltraFastDuplicateFilesFinder.py index a7b479f..1850d16 100644 --- a/UltraFastDuplicateFilesFinder.py +++ b/UltraFastDuplicateFilesFinder.py @@ -46,7 +46,7 @@ def get_file_hash(filename, limit_size=None, buffer_size=BUFFER_SIZE): """ # open file try: - f = file(filename, "rb") + f = open(filename, "rb") except IOError: return 'NONE' @@ -105,9 +105,10 @@ def humanize_size(size): if hsize > 0.5: return '%.2f %s' % (hsize, suffix) -def main(): +def main(dirWalker=sys.stdin): + global totalsize, totalfiles, dupfiles, dupsize # we start here by checking all files - for filename in sys.stdin: + for filename in dirWalker: filename = filename.strip() check_file(filename) @@ -117,7 +118,7 @@ def main(): # print the report print( '%10s %s' % ('size', 'filename') ) - for h, f in hashlist.iteritems(): + for h, f in hashlist.items(): if hashcount[h] < 2: # present one time, skip continue @@ -147,7 +148,7 @@ def main(): # final summary print( '%d files checked (%s), %d duplicates (%s).' 
% ( totalfiles, humanize_size(totalsize), dupfiles, humanize_size(dupsize))) - + return hashlist if __name__ == '__main__': main() diff --git a/fileWalker.py b/fileWalker.py index 28de5ca..6bae9ca 100644 --- a/fileWalker.py +++ b/fileWalker.py @@ -5,9 +5,22 @@ - - - +def walkerAdapter(walker, hiddenFolders=False): + for curDir, dirList, fileList in walker: + for filename in fileList: + filepath = os.path.join( curDir, filename ) + if not hiddenFolders and folderIsHidden(filepath): + continue + yield filepath + +def folderIsHidden(filepath): + par = filepath + while 1: + par, cd = os.path.split(par) + if cd.startswith('.') and not cd == '.': + return True + if not par: + break def getDirName(dirName=None): if dirName: @@ -27,14 +40,23 @@ def getDirName(dirName=None): if os.path.isdir( os.path.normpath( inp )): break return inp + + +def delete_duplicates(hashlist): + pass def main(dirName=None): root = getDirName(dirName) walker = os.walk(root) - - for j in walker: - print(j) + walker = walkerAdapter(walker) + hashlist = ff.main(walker) + inp = input('would you like to remove all duplicates?\n') + if inp.strip() in ['yes', 'y', 'ya']: + delete_duplicates(hashlist) +## for j in hashlist.items(): +## if len(j[1]) >1: +## print(j) if __name__ == '__main__': main() From cd1c581dfcfd3be58b012b70b49a992aaf99f802 Mon Sep 17 00:00:00 2001 From: sesas Date: Tue, 27 Mar 2012 19:10:11 -0700 Subject: [PATCH 4/9] implemented the deleter function, with the option for it to be interactive. 
Signed-off-by: sesas --- fileWalker.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fileWalker.py b/fileWalker.py index 6bae9ca..5e4f4e3 100644 --- a/fileWalker.py +++ b/fileWalker.py @@ -42,8 +42,16 @@ def getDirName(dirName=None): return inp -def delete_duplicates(hashlist): - +def delete_duplicates(hashlist, interactive=True): + for fl in hashlist.values(): + for filename in fl[1:]: + print('removing:', filename) + if interactive: + inp = input('?').strip() + if not inp in ['yes', 'y', 'ya']: + continue + os.remove(filename) + pass pass def main(dirName=None): @@ -51,9 +59,13 @@ def main(dirName=None): walker = os.walk(root) walker = walkerAdapter(walker) hashlist = ff.main(walker) - inp = input('would you like to remove all duplicates?\n') - if inp.strip() in ['yes', 'y', 'ya']: + inp = input('would you like to remove all duplicates?\n').strip() + if inp in ['int', 'yi']: + delete_duplicates(hashlist, True) + elif inp in ['yes', 'y', 'ya']: delete_duplicates(hashlist) + else: + print('no file was deleted') ## for j in hashlist.items(): ## if len(j[1]) >1: ## print(j) From 2ef7addb4e629a9b4ed7f8e96ef637857377c575 Mon Sep 17 00:00:00 2001 From: sesas Date: Tue, 27 Mar 2012 19:14:35 -0700 Subject: [PATCH 5/9] finished the deleter function with more verbose output than earlier.
Signed-off-by: sesas --- fileWalker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fileWalker.py b/fileWalker.py index 5e4f4e3..185ac30 100644 --- a/fileWalker.py +++ b/fileWalker.py @@ -42,14 +42,17 @@ def getDirName(dirName=None): return inp -def delete_duplicates(hashlist, interactive=True): +def delete_duplicates(hashlist, interactive=True, verbose=True): for fl in hashlist.values(): + if len(fl) > 1: + print('keeping:', fl[0]) for filename in fl[1:]: - print('removing:', filename) + print('duplicate:', filename) if interactive: inp = input('?').strip() if not inp in ['yes', 'y', 'ya']: continue + print('deleting:', filename) os.remove(filename) pass pass From 96f6d593a084f9e1edb91403db266a71aba7f9cd Mon Sep 17 00:00:00 2001 From: sesas Date: Tue, 27 Mar 2012 19:49:51 -0700 Subject: [PATCH 6/9] debugged a bit more, especially the folder input function, and fixed some bugs. Signed-off-by: sesas --- fileWalker.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fileWalker.py b/fileWalker.py index 185ac30..89d01ec 100644 --- a/fileWalker.py +++ b/fileWalker.py @@ -17,9 +17,10 @@ def folderIsHidden(filepath): par = filepath while 1: par, cd = os.path.split(par) +## print(par, ':', cd) if cd.startswith('.') and not cd == '.': return True - if not par: + if not par or os.path.ismount(par): break def getDirName(dirName=None): @@ -32,12 +33,14 @@ def getDirName(dirName=None): return out while 1: - inp = input("which folder would you like to find the duplicates in?\n") + inp = print("which folder would you like to find the duplicates in?") +## inp = print("(make sure the path you insert has double \\ in between folders.") + inp = input() if not inp: # mainly for debugging global testPath inp = testPath break - if os.path.isdir( os.path.normpath( inp )): + if os.path.isdir( inp ): break return inp From 1aa8a0f71e36e066ef1fb7ea423b3668cb1bec6c Mon Sep 17 00:00:00 2001 From: sesas Date: Tue, 27 Mar 2012 20:07:18 -0700 
Subject: [PATCH 7/9] made it faster (it now checks only the beginning of a file) Signed-off-by: sesas --- UltraFastDuplicateFilesFinder.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/UltraFastDuplicateFilesFinder.py b/UltraFastDuplicateFilesFinder.py index 1850d16..f993de2 100644 --- a/UltraFastDuplicateFilesFinder.py +++ b/UltraFastDuplicateFilesFinder.py @@ -78,7 +78,7 @@ def check_file(filename): Compare the given file to our lists of hashes """ # compute md5 - h = get_file_hash(filename) + h = get_file_hash(filename, CHUNK_SIZE//2) # increase count i = hashcount.get(h, 0) @@ -110,7 +110,8 @@ def main(dirWalker=sys.stdin): # we start here by checking all files for filename in dirWalker: filename = filename.strip() - + if not totalfiles % 100: + print(filename) check_file(filename) totalfiles += 1 totalsize += os.path.getsize(filename) @@ -121,6 +122,7 @@ def main(dirWalker=sys.stdin): for h, f in hashlist.items(): if hashcount[h] < 2: # present one time, skip + continue # reference file From 418bc012086e6428d3ceef8901e0a75bcf50f29f Mon Sep 17 00:00:00 2001 From: sesas Date: Wed, 28 Mar 2012 13:36:49 -0700 Subject: [PATCH 8/9] modified readme to reflect the extensions made. 
Signed-off-by: sesas --- README | 31 ++++++++++++++++++++++++++++++- test/README | 31 ++++++++++++++++++++++++++++++- test/README_2 | 31 ++++++++++++++++++++++++++++++- test/README_3 | 31 ++++++++++++++++++++++++++++++- 4 files changed, 120 insertions(+), 4 deletions(-) diff --git a/README b/README index 07a6777..cddcda5 100644 --- a/README +++ b/README @@ -1,8 +1,37 @@ -Ultra Fast Duplicate Files Finder +================================= +original Ultra Fast Duplicate Files Finder ================================= by Gautier Portet +forked and extended: + by Gabriel Reyla +The extended version works on Windows too: +------------------------------------------------------------------------------ +$ python fileWalker.py +which folder would you like to find the duplicates in? +. +.\fileWalker.py + size filename + 1306 .\README + 1306 .\test\README + 1306 .\test\README_2 + 1306 .\test\README_3 + +7 files checked (15.24 KiB), 1 duplicates (3.83 KiB). +would you like to remove all duplicates? +yes +keeping: .\README +duplicate: .\test\README +?y +deleting: .\test\README +duplicate: .\test\README_2 +?n +duplicate: .\test\README_3 +?n +>>> +------------------------------------------------------------------------------ +On Unix systems the program can be used as follow: Takes a list of file from stdin. And print the duplicate ones. diff --git a/test/README b/test/README index 07a6777..cddcda5 100644 --- a/test/README +++ b/test/README @@ -1,8 +1,37 @@ -Ultra Fast Duplicate Files Finder +================================= +original Ultra Fast Duplicate Files Finder ================================= by Gautier Portet +forked and extended: + by Gabriel Reyla +The extended version works on Windows too: +------------------------------------------------------------------------------ +$ python fileWalker.py +which folder would you like to find the duplicates in? +. 
+.\fileWalker.py + size filename + 1306 .\README + 1306 .\test\README + 1306 .\test\README_2 + 1306 .\test\README_3 + +7 files checked (15.24 KiB), 1 duplicates (3.83 KiB). +would you like to remove all duplicates? +yes +keeping: .\README +duplicate: .\test\README +?y +deleting: .\test\README +duplicate: .\test\README_2 +?n +duplicate: .\test\README_3 +?n +>>> +------------------------------------------------------------------------------ +On Unix systems the program can be used as follow: Takes a list of file from stdin. And print the duplicate ones. diff --git a/test/README_2 b/test/README_2 index 07a6777..cddcda5 100644 --- a/test/README_2 +++ b/test/README_2 @@ -1,8 +1,37 @@ -Ultra Fast Duplicate Files Finder +================================= +original Ultra Fast Duplicate Files Finder ================================= by Gautier Portet +forked and extended: + by Gabriel Reyla +The extended version works on Windows too: +------------------------------------------------------------------------------ +$ python fileWalker.py +which folder would you like to find the duplicates in? +. +.\fileWalker.py + size filename + 1306 .\README + 1306 .\test\README + 1306 .\test\README_2 + 1306 .\test\README_3 + +7 files checked (15.24 KiB), 1 duplicates (3.83 KiB). +would you like to remove all duplicates? +yes +keeping: .\README +duplicate: .\test\README +?y +deleting: .\test\README +duplicate: .\test\README_2 +?n +duplicate: .\test\README_3 +?n +>>> +------------------------------------------------------------------------------ +On Unix systems the program can be used as follow: Takes a list of file from stdin. And print the duplicate ones. 
diff --git a/test/README_3 b/test/README_3 index 07a6777..cddcda5 100644 --- a/test/README_3 +++ b/test/README_3 @@ -1,8 +1,37 @@ -Ultra Fast Duplicate Files Finder +================================= +original Ultra Fast Duplicate Files Finder ================================= by Gautier Portet +forked and extended: + by Gabriel Reyla +The extended version works on Windows too: +------------------------------------------------------------------------------ +$ python fileWalker.py +which folder would you like to find the duplicates in? +. +.\fileWalker.py + size filename + 1306 .\README + 1306 .\test\README + 1306 .\test\README_2 + 1306 .\test\README_3 + +7 files checked (15.24 KiB), 1 duplicates (3.83 KiB). +would you like to remove all duplicates? +yes +keeping: .\README +duplicate: .\test\README +?y +deleting: .\test\README +duplicate: .\test\README_2 +?n +duplicate: .\test\README_3 +?n +>>> +------------------------------------------------------------------------------ +On Unix systems the program can be used as follow: Takes a list of file from stdin. And print the duplicate ones. 
From dfa6bd0ae27e0130479e0d7136bc086fe3f2177a Mon Sep 17 00:00:00 2001 From: sesas Date: Wed, 28 Mar 2012 14:44:06 -0700 Subject: [PATCH 9/9] added a startup script so that the others are compiled to .pyc Signed-off-by: sesas --- UltraFastDuplicateFilesFinder.py | 4 ++-- duplicate.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 duplicate.py diff --git a/UltraFastDuplicateFilesFinder.py b/UltraFastDuplicateFilesFinder.py index f993de2..58c7bbc 100644 --- a/UltraFastDuplicateFilesFinder.py +++ b/UltraFastDuplicateFilesFinder.py @@ -110,8 +110,8 @@ def main(dirWalker=sys.stdin): # we start here by checking all files for filename in dirWalker: filename = filename.strip() - if not totalfiles % 100: - print(filename) + if not totalfiles % 500: + print('files processed:', totalfiles, filename) check_file(filename) totalfiles += 1 totalsize += os.path.getsize(filename) diff --git a/duplicate.py b/duplicate.py new file mode 100644 index 0000000..c3a937b --- /dev/null +++ b/duplicate.py @@ -0,0 +1,4 @@ +import fileWalker + +if __name__ == '__main__': + fileWalker.main()