#!/usr/bin/python
#
# $Id$
#
#  program that runs through a set of directory trees, and 
#  finds the files in it that are identical. It then gives a choice
#  of one of the files, and then deletes the rest.
#
#  Version 0.1 Copyright Duncan Robertson duncan@linuxbandwagon.com
#  July 2001
#
#  Version 0.2 - display mode, plus doesn't use external crc check, uses
#                pythonic md5sum
#
#  This program is released under the Gnu General Public License
#  see http://www.gnu.org/licenses/gpl.html for further information.
#
#  currently hardcoded to use md5sum, but could use any method to collect
#  a reliable crc.
#

from snack import *
import  os.path, string, sys, commands, re
import md5

# Verbosity flag read by verbose(): progress messages are printed only when
# it is non-zero.  The command-line loop further down sets it to 1 for -v.
pverbose = 0

def debug(bla):
    """Debugging hook that is currently switched off.

    The message *bla* is accepted and discarded.  To trace the program,
    make this function print its argument.
    """
    return None

def verbose(bla):
    """Print *bla* on standard output when the module-level pverbose flag
    is set; do nothing otherwise."""
    if not pverbose:
        return
    # A parenthesised single argument prints exactly what the bare
    # print statement would.
    print(bla)

# Maps each file checksum to the list of paths that were found with that
# checksum; filled in by checksumsadd().
checksums = {}

# When non-zero, getchecksums() skips symlinks instead of checksumming them.
# NOTE(review): nothing in the visible code ever sets this to non-zero.
ignorelinks = 0


def checksum(filename):
    """Return the raw MD5 digest of the contents of the file *filename*.

    The file is opened in binary mode so the digest covers the exact bytes
    on disk (text mode may translate line endings or stop early on some
    platforms, which would make different files look identical).  It is
    read in fixed-size chunks so large files are not loaded into memory in
    one piece, and it is closed even if reading fails.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    try:
        import hashlib
        hasher = hashlib.md5()
    except ImportError:
        # Interpreters that predate hashlib: fall back to the md5 module
        # imported at the top of this file.
        hasher = md5.new()

    thefile = open(filename, 'rb')
    try:
        while 1:
            chunk = thefile.read(65536)
            if not chunk:
                break
            hasher.update(chunk)
    finally:
        thefile.close()
    return hasher.digest()

#
# checksums is a dictionary of lists of filenames...
#
# this function adds a filename to the list if the checksum exists,
# or adds a new list with that filename and checksum otherwise
#

def checksumsadd(checksum,newpath):
    """Record *newpath* under *checksum* in the module-level checksums dict.

    A new, empty list is created the first time a checksum is seen.  A path
    already present in that checksum's list is not added again; the
    repetition is only reported through debug().
    """
    paths = checksums.setdefault(checksum, [])
    if newpath in paths:
        debug(newpath+' already in dictionary, skipping')
        return
    paths.append(newpath)
   
      
def getchecksums(arg, dirname, names):
    """os.path.walk() visitor that checksums the files of one directory.

    arg     -- the extra value handed through by os.path.walk(); unused.
    dirname -- the directory being visited.
    names   -- the names of the entries in that directory.

    Every entry that os.path.isfile() accepts is checksummed and recorded
    with checksumsadd().  Symlinks are skipped only when the module-level
    ignorelinks flag is set.  A file that cannot be read is reported on
    standard error and skipped, so one bad file does not abort the scan.
    """
    for name in names:
        newpath = os.path.normpath(os.path.join(dirname, name))
        verbose('Processing '+newpath)

        if not os.path.isfile(newpath):
            verbose('not file : '+newpath)
            continue

        if ignorelinks and os.path.islink(newpath):
            verbose('symlink found for '+newpath)
            continue

        try:
            digest = checksum(newpath)
        except (IOError, OSError):
            # sys.exc_info() keeps this handler valid on old and new
            # interpreters alike (no "except ... as" syntax needed).
            sys.stderr.write('skipping unreadable file '+newpath+': '
                             +str(sys.exc_info()[1])+'\n')
            continue
        checksumsadd(digest, newpath)

# Called without any argument: there is nothing to scan, so show the usage
# text and stop.
if not sys.argv[1:]:
    print('''

fixdupes.py <-n> <-v> [dir] <dir> ...

-n is non interactive mode, just prints a list of identical files
-v displays some info as it is processing

duncan@zog.net.au
''')
    sys.exit(0)


# Split the command line into option flags and directories first, so that
# -n and -v apply to every directory wherever they appear on the command
# line (previously -v had no effect on directories listed before it).
noninteractive = 0
directories = []
for argument in sys.argv[1:]:
    if argument == "-n":
        noninteractive = 1
    elif argument == "-v":
        pverbose = 1
    else:
        directories.append(argument)

# Walk each tree; getchecksums() fills the checksums dictionary.  The
# third argument is passed through to the visitor, which ignores it.
for directory in directories:
    os.path.walk(directory, getchecksums, 'arg')

#
#  we now have a dictionary of checksums for the files
#
if noninteractive:
    # Report mode: list the files without deleting anything, then exit.
    # Each value of checksums is the list of paths sharing one checksum.
    # Iterating over the lists directly avoids a loop variable named
    # "checksum", which would rebind the checksum() function.
    groups = checksums.values()

    print('')
    print('These files are unique:')
    for paths in groups:
        if len(paths) == 1:
            print(paths[0])

    print('These files are duplicates:')
    for paths in groups:
        if len(paths) > 1:
            # separator line in front of every group of identical files
            print('-----------------------------')
            for filename in paths:
                print(filename)
    sys.exit(0)

# Interactive mode: for every group of identical files ask which one to
# keep and delete the others.
#
# NOTE(review): when ignorelinks is 0 a symlink and its target end up in the
# same group, so keeping the link deletes the real file -- confirm whether
# that is intended.
screen = SnackScreen()
quit_requested = 0
undeletable = None

# try/finally guarantees the terminal is restored exactly once, whatever
# happens inside the loop.
try:
    for duplicates in checksums.values():
        if len(duplicates) < 2:
            continue

        sresult = ListboxChoiceWindow(screen, 'Duplicate Files',
            'Hitting enter will keep only the one selected: ',
            duplicates,
            ('delete all but this','skip','quit'),
            40,1,9)
        button = sresult[0]
        selected = sresult[1]

        if button == 'quit':
            quit_requested = 1
            break
        if button == 'skip':
            continue

        # Any other result means "delete all but this": drop the chosen
        # entry from the list, then remove every file still in it.
        del duplicates[selected]
        for path in duplicates:
            try:
                os.remove(path)
            except OSError:
                # we should fix this exception handling so it gives some
                # more feedback and options...
                undeletable = path
                break
        if undeletable is not None:
            break
finally:
    screen.finish()

# Messages are printed only after the screen has been shut down.
if undeletable is not None:
    print('bailing, problem deleting '+undeletable)
    sys.exit(1)
if quit_requested:
    sys.exit(0)
