davideaves.com

Live in a world of your own, but always welcome visitors.

View on GitHub
14 August 2014

Identify duplicate files in the current working directory by file hash and take a user-specified action.

by deaves

There are a lot of programs out there that will locate duplicate files on a filesystem. I, however, prefer to use standard system utilities on my home system, so I wrote a quick shell script to identify duplicate files and either create hard links between them or prompt me on which files to delete outright.

Feel free to review, modify or use this script however you see fit. Remember you do so at your own risk!

#!/bin/bash
## Created by: deaves
# Identify duplicate files in current working directory by file hash and take user specified action.
#
## Requires: bash, dialog, findutils (find), coreutils (md5sum, sort, tee), file, awk, grep, sed

# Required script variables.
# The two scratch files are created with mktemp instead of a guessable
# /tmp/$USER-$$ name, so another user cannot pre-create or symlink them.
filehash_dump="$(mktemp "${TMPDIR:-/tmp}/udupe-hash.XXXXXX")" || exit 1
dialog_select="$(mktemp "${TMPDIR:-/tmp}/udupe-select.XXXXXX")" || exit 1
report_file="$HOME/uDupeReport-$$.txt"

# Remove the scratch files when the script exits, whichever path it takes.
trap 'rm -f -- "$filehash_dump" "$dialog_select"' EXIT

# FUNCTION: End Script if error.
# Arguments: $1 - (optional) name of the tool that is missing. When omitted,
#            the last argument of the command that ran just before DIE is
#            used ("$_"), so `type -p dialog > /dev/null || DIE` reports
#            "dialog".
# Outputs:   the error message, on stderr.
# Exits:     with status 1 so callers can tell the script failed.
DIE() {
 # "$_" has to be read by the very first command in the function; any later
 # command would overwrite it with its own last argument.
 local tool="${1:-$_}"
 echo "ERROR: Validate \"$tool\" is installed and working on your system." >&2
 exit 1
}

# Validate script requirements are met.
# DIE must directly follow the lookup: it reports the lookup's last argument.
if ! type -p dialog > /dev/null; then
 DIE
fi

# Create the filehash_dump file containing MD5 finger prints.
# Every regular file below the current directory is hashed; the listing is
# shown on screen and saved to filehash_dump at the same time.
# "-exec ... {} +" passes many files to each md5sum run instead of starting
# one md5sum process per file; the output lines are the same.
# NOTE(review): "$1" is presumably an optional extra md5sum argument - confirm.
# It is passed as a single word and dropped entirely when empty or unset.
find ./ -type f -exec md5sum ${1:+"$1"} {} + | tee "$filehash_dump"

# Prompt user for next action to take w/6 hour wait before continuing on.
# The reply is stored in OPT. -r keeps any backslash in the reply literal
# instead of letting read treat it as an escape character.
# Any reply other than DELETE or LINK (including none) leads to report mode.
echo
read -r -t 21600 -p "DELETE or LINK duplicate files?
If NO response create a report: $report_file
Type command in CAPS: " OPT
echo

# FUNCTION: Load every path recorded under hash $1 into the global FILES array.
# Lines in filehash_dump have the md5sum layout "<hash>  <path>".
# Paths are stored verbatim - nothing is passed through eval - so quotes, "$",
# backticks and spaces inside a file name are never interpreted by the shell.
load_files_for_hash() {
 local hash="$1" line
 FILES=()
 while IFS= read -r line; do
  # Strip only the leading "<hash>  " prefix; the rest is the path.
  FILES+=("${line#"$hash  "}")
 done < <(grep -- "^$hash  " "$filehash_dump")
}

# FUNCTION: Replace duplicate $2 with a hard link to $1.
# The link is first created under a temporary name and then renamed over the
# duplicate, so the duplicate is never removed unless its replacement exists.
# Returns 0 on success (or when both names already are the same file), 1 on
# failure with the duplicate left untouched.
hardlink_duplicate() {
 local orig="$1" dupe="$2" tmp_link

 # Already hard linked to each other: nothing to do.
 [[ "$dupe" -ef "$orig" ]] && return 0

 tmp_link="$dupe.udupe-$$"
 if ln -- "$orig" "$tmp_link"; then
  mv -f -- "$tmp_link" "$dupe" && return 0
  # Only clean up a temporary link that this function created itself.
  rm -f -- "$tmp_link"
 fi

 echo "ERROR: could not link \"$dupe\" to \"$orig\"; file left untouched." >&2
 return 1
}

# Begin loop with all file hashes.
# The input is redirected (not passed as an awk operand) so a missing dump
# file is an error instead of awk silently reading standard input.
awk '{print $1}' < "$filehash_dump" | sort -u | while IFS= read -r HASH
 do

 # md5sum starts a line with "\" when it had to escape characters in the
 # file name; the recorded name is then not the real path, so never act on it.
 [[ "$HASH" == \\* ]] && continue

 # Read all the files of the hash into an Array.
 load_files_for_hash "$HASH"

 if [ "$OPT" == "DELETE" ]; then

  # If more than a single file exists then take action.
  if [ "${#FILES[@]}" -gt 1 ]; then

   # One "<index> <path>" pair per menu entry. Passing them as array
   # elements keeps spaces and glob characters in the names intact.
   MENU=()
   for i in "${!FILES[@]}"; do
    MENU+=("$i" "${FILES[$i]}")
   done

   # Prompt user on which file to keep and store it in dialog_select.
   if dialog --backtitle "$0 - uDupe File Killer" \
     --title "Select the file to keep. All others files will be deleted!" \
     --menu "HASH: $HASH\nTYPE: $(file -b -- "${FILES[0]}")" \
     17 70 14 \
     "${MENU[@]}" 2> "$dialog_select"; then

    # User selected a file.
    ANS=$(< "$dialog_select"); rm -f -- "$dialog_select"

    # Delete only when the answer is a valid index; otherwise the kept
    # file could not be taken out of the list and every copy would go.
    if [[ "$ANS" =~ ^[0-9]+$ ]] && [ "$ANS" -lt "${#FILES[@]}" ]; then
     unset "FILES[$ANS]"

     # Perform action against unselected files.
     for file in "${FILES[@]}"; do
      rm -- "$file"
     done
    else
     echo "ERROR: unexpected selection \"$ANS\"; nothing deleted for $HASH." >&2
    fi

   else

    # User chose to cancel.
    rm -f -- "$dialog_select" "$filehash_dump"
    break

   fi

  fi

 elif [ "$OPT" == "LINK" ]; then

  # hardLink all the duplicate files to save disk space.
  if [ "${#FILES[@]}" -gt 1 ]; then
   echo "Creating hardlink: $HASH"
   echo " > ${FILES[0]}"
   ORIG="${FILES[0]}"
   for file in "${FILES[@]:1}"; do
    echo " > $file"
    hardlink_duplicate "$ORIG" "$file"
   done
  fi

 else

  # Just Report all the duplicate files.
  # The header is written once, with the directory passed as printf data so
  # a "%" or "\" in the path cannot be taken as a format directive.
  [ -f "$report_file" ] || printf 'uDupe Report: %s\n%s\n\n' "$(pwd)" "$(date)" > "$report_file"
  if [ "${#FILES[@]}" -gt 1 ]; then
   echo "HASH: $HASH"
   echo "TYPE: $(file -b -- "${FILES[0]}")"
   for file in "${FILES[@]}"; do
    echo " > $file"
   done
   echo
  fi | tee -a "$report_file"

 fi

done

# All hashes are processed: drop the hash list if it is still on disk.
test -e "$filehash_dump" && rm "$filehash_dump"
tags: