Consolidation program

Dave Smith dave at thesmithfam.org
Wed Oct 10 13:17:45 MDT 2007


Dave Smith wrote:
> I have a co-worker who has hundreds of duplicate large files strewn 
> about his Linux file system, and I want to help him consolidate them. 
> What I have in mind is a program that will find all files with a 
> certain extension, md5sum them all, and then create a master directory 
> with the files, and replace all the duplicates with symbolic links. 
> Does anyone know of such a utility?

I wrote my own little solution in Qt/C++, called qduper. It's not a GUI 
app, just runs in the console, but uses things like QDir, QString, 
QFileInfo, and QProcess to ease the job. I wrote it against Qt 4.3.2, 
but it should work with any Qt 4 version. It most likely will not 
compile against Qt 3 (any version). It only runs on *nix, not Windows. 
It might run on Mac, though it's untested.

The program recursively finds all duplicates in a list of directories, 
and moves one of the duplicate files from each set to a master 
directory, symbolically linking to it from all the other duplicates and 
its previous location. Oh, and you can specify a file suffix so it only 
operates on files that end in a certain extension (or leave it blank). 
If you've got a lot of copied files lying around, this might help you 
clean them up.

If anyone finds it useful, let me know. I sure did.

--Dave

P.S. This certainly isn't a shining example of Qt's awesomeness, but it 
works okay'ish, and the code isn't awful. :)
-------------- next part --------------
#include <cstdio>

#include <QByteArray>
#include <QDir>
#include <QDirIterator>
#include <QFile>
#include <QFileInfo>
#include <QIODevice>
#include <QList>
#include <QMap>
#include <QProcess>
#include <QRegExp>
#include <QSet>
#include <QString>
#include <QStringList>
#include <QtDebug>
#include <QtGlobal>

int main( int argc, char **argv )
{
    if( argc < 4 )
    {
        printf( "qduper finds duplicate files in a directory (and sub-directories), moves one of each duplicate set to a master directory, and then sym-links all the other duplicates to it. It preserves all files names.\n" );
        printf( "\n" );
        printf( "Usage: %s <file suffix> <master directory> <directory to search> [<directory to search> ... ]\n", argv[0] );
        return 1;
    }

    QString fileSuffix( argv[1] );
    QString masterDirectory( argv[2] );
    QSet<QString> directorySet;
    for( int i=3; i<argc; i++ )
        directorySet << argv[i];

    if( ! QDir(masterDirectory).exists() )
    {
        qWarning() << "Error: The specified master directory" << masterDirectory << "does not exist.";
        return 1;
    }

    masterDirectory = QFileInfo(masterDirectory).canonicalFilePath();

    qDebug() << "";
    qDebug() << "File suffix:     " << fileSuffix;
    qDebug() << "Master directory:" << masterDirectory;
    qDebug() << "Directory list:  " << QStringList(directorySet.toList()).join( ", " );
    qDebug() << "";

    QRegExp md5regex( "^([a-z0-9]+)\\s*", Qt::CaseInsensitive );
    QMap<QString,QSet<QString> > md5map;
    foreach( QString directory, directorySet )
    {
        QDirIterator dirIterator( directory, QDirIterator::Subdirectories );
        while( dirIterator.hasNext() )
        {
            QString fileName = dirIterator.next();
            QFileInfo fileInfo( fileName );
            fileName = fileInfo.canonicalFilePath();
            if( fileInfo.isFile() && fileName.endsWith( fileSuffix ) )
            {
                if( fileInfo.dir() == QDir(masterDirectory) )
                {
                    qDebug() << "Skipping" << fileName << "beacuse it is inside the master directory.";
                    continue;
                }
                else if( fileInfo.isSymLink() )
                {
                    qDebug() << "Skipping" << fileName << "beacuse it is a symbolic link.";
                    continue;
                }

                QProcess process;
                process.start( "md5sum", QStringList() << fileName, QIODevice::ReadOnly );
                process.waitForFinished( -1 );

                if( process.exitCode() == 0 && process.exitStatus() == QProcess::NormalExit )
                {
                    QString md5output = process.readAllStandardOutput();
                    if( md5regex.indexIn( md5output ) != -1 )
                    {
                        QString md5 = md5regex.cap(1);
                        md5map[ md5 ].insert( fileName );
                    }
                }
                else
                {
                    qWarning() << "Warning:" << process.readAllStandardError().trimmed();
                }
            }
        }
    }

    int duplicateCount = 0;
    foreach( QString md5, md5map.keys() )
    {
        QStringList duplicateFileNames = md5map[md5].toList();
        if( duplicateFileNames.count() > 1 )
        {
            duplicateCount += duplicateFileNames.count();
            QString masterFile = duplicateFileNames[0];
            qDebug() << "I will move" << masterFile << "to directory" << masterDirectory;
            qDebug() << "   And sym-link the following files to it:";
            for( int i=1; i<duplicateFileNames.count(); i++ )
                qDebug() << "      " << duplicateFileNames[i];
        }
    }

    if( duplicateCount == 0 )
    {
        qDebug() << "No duplicates found with file suffix" << fileSuffix;
        return 0;
    }

    printf( "\nDo you want to do this?\n  yes or no: " );
    fflush( stdout );
    char answer[16];
    scanf( "%s", answer );
    if( ! QString(answer).toLower().contains( "y" ) )
    {
        qDebug() << "\nNot doing anything.\n";
        return 0;
    }

    foreach( QString md5, md5map.keys() )
    {
        QStringList duplicateFileNames = md5map[md5].toList();

        if( duplicateFileNames.count() > 1 )
        {
            QString masterFile = duplicateFileNames[0];
            QString masterFileNewName = masterDirectory + "/" + QFileInfo(masterFile).fileName();
            if( QProcess::execute( "mv", QStringList() << "-v" << masterFile << masterFileNewName ) == 0 )
            {
                 Q_ASSERT( ! QFile::exists(masterFile) );
                 if( QProcess::execute( "ln", QStringList() << "-s" << masterFileNewName << masterFile ) == 0 )
                 {
                     for( int i=1; i<duplicateFileNames.count(); i++ )
                     {
                         QString fileName = duplicateFileNames[i];
                         if( QProcess::execute( "rm", QStringList() << "-v" << fileName ) == 0 )
                         {
                             if( QProcess::execute( "ln", QStringList() << "-s" << masterFile << fileName ) != 0 )
                             {
                                 qWarning() << "Error: Could not create sym-link from" << fileName << "to" << masterFile;
                             }
                         }
                         else
                         {
                             qWarning() << "Error: Could not remove file" << fileName << "to make it a sym-link.";
                         }
                     }
                 }
                 else
                 {
                     qWarning() << "Error: Could not sym-link master file" << masterFile << "to the master directory" << masterFileNewName;
                 }
            }
            else
            {
                qWarning() << "Error: Could not move file" << masterFile << "to the master directory" << masterDirectory;
            }
        }
    }

    return 0;
}


More information about the PLUG mailing list