# time/datasource/sceau/naming_conventions_files.py (from RR0/rr0.org)
#
# Code-quality summary:
#   Maintainability: 6 days
#   Test coverage: (not reported)
#   Issues: (not listed)
# coding: utf-8

# all string literals are unicode (docstrings included)
from __future__ import unicode_literals

import getopt
import os
import re
import sys
import unicodedata

import unidecode


# rules


def ridoffbadchars(s):
    """Replace a fixed set of private-use code points with underscores.

    Used as a last resort for characters that are still unknown after the
    other clean-up steps.  The affected code points are U+F018, U+F019,
    U+F020 to U+F029, U+F030 and U+F031; every other character is returned
    untouched.
    """
    # NOTE(review): U+F01A-U+F01F and U+F02A-U+F02F are not covered --
    # confirm whether these gaps are intentional.
    unknown_chars = "[\uf018\uf019\uf020-\uf029\uf030\uf031]"
    return re.sub(unknown_chars, "_", s)


def accentsTidyP2(s):
    """Second tidy pass: normalise separators and leftovers in an ASCII name.

    Applied in order:
      * "ndeg " and "ndeg" (the transliteration of "n°") become "#";
      * runs of "_" collapse to a single "_", and "_-_" becomes "-";
      * a "_" directly next to a backslash (start or end of a path component)
        is dropped, as is a "_" at the very start or end of the string;
      * repeated "deg" collapses to one "deg", and a trailing "_deg" or "deg"
        is removed;
      * the remaining unknown characters are handled by ridoffbadchars().

    Returns the cleaned string.
    """
    # "n°" becomes "#".  The second substitution must continue from r, not
    # from s, otherwise the "ndeg " replacement is thrown away.
    r = re.sub("ndeg ", "#", s)
    r = re.sub("ndeg", "#", r)
    # collapse every run of '_' whatever its length
    r = re.sub("_{2,}", "_", r)
    # remove too many '_-_'
    r = re.sub("_-_", "-", r)
    # remove '_' at start or end of dir name in the chain of subs
    # (_sub1_\_sub2_\_sub3_) => (sub1\sub2\sub3): '\_' and '_\' become '\'
    literal_backslash = r"\\"
    r = re.sub(r"\\_", literal_backslash, r)
    r = re.sub(r"_\\", literal_backslash, r)
    # a '_' at the beginning or end of the whole chain must go as well,
    # else the rename will not be possible
    r = re.sub("^_", "", r)
    r = re.sub("_$", "", r)

    # collapse any repetition of "deg" into a single one
    r = re.sub("(?:deg){2,}", "deg", r)

    r = re.sub("_deg$", "", r)
    r = re.sub("deg$", "", r)

    # as a last resort get rid of still unknown unicode characters
    return ridoffbadchars(r)


def accentsTidyP1(s):
    """First tidy pass: transliterate to ASCII and lower-case the result.

    unidecode converts as best it can to ASCII characters (special case:
    '°' becomes 'deg').  Returns the lower-cased transliteration.
    """
    r = unidecode.unidecode(s).lower()
    # Ligature safety net.  The patterns used to be written as JavaScript
    # regex literals ("/æ/g"), which in Python only match that literal text.
    # unidecode is expected to have expanded these ligatures already, so the
    # two replacements below should normally find nothing to do.
    r = r.replace("æ", "ae")
    r = r.replace("œ", "oe")
    return r


def accentsTidy(s):
    """Return the normalised form of a directory name (or backslash path).

    A leading '.\\' is kept verbatim and the remainder is processed
    recursively.  Otherwise the name goes through accentsTidyP1(), every
    character that is not a word character, a digit, '-', a backslash or '/'
    becomes '_', and accentsTidyP2() finishes the job.
    """
    # special case of '.\' to keep for the root dirs; startswith() is safe on
    # empty and one-character strings, unlike indexing s[0] / s[1]
    if s.startswith('.\\'):
        return '.\\' + accentsTidy(s[2:])

    r = accentsTidyP1(s)
    # anything that is neither a letter nor a number nor a \ nor '-' is replaced by _
    r = re.sub(r"[^-\d\w\\/]", '_', r)
    return accentsTidyP2(r)


def accentsTidyFiles(s):
    """Return the normalised form of a file name (or backslash path to a file).

    Same pipeline as accentsTidy(), except that '.', '(', ')', '&' and '#'
    are also allowed, and only the last '.' is kept (earlier ones become
    '_') so that the extension survives.
    """
    # special case of '.\' to keep for the root dirs.  The remainder must be
    # handled with the *file* rules; the directory rules would turn the
    # extension dot into '_'.
    if s.startswith('.\\'):
        return '.\\' + accentsTidyFiles(s[2:])

    r = accentsTidyP1(s)
    # anything that is neither a letter nor a number nor a \ nor '-' nor '.'
    # (nor one of '(', ')', '&', '#') is replaced by _
    r = re.sub(r"[^-.(&)#\d\w\\/]", '_', r)
    # only keep the last '.'
    pcount = r.count('.')
    if pcount > 1:
        r = r.replace('.', '_', pcount - 1)
    return accentsTidyP2(r)


def lastCleanup(sout):
    """Remove separator leftovers once a date has been moved to the front.

    Applied in order: '_.' becomes '.', '-.' becomes '.', '_-' becomes '_',
    and a trailing '-' is dropped.  Returns the cleaned string.
    """
    # raw strings: the former non-raw patterns relied on invalid escape
    # sequences such as '\.' and '\-'
    sout = re.sub(r'_\.', '.', sout)
    sout = re.sub(r'-\.', '.', sout)
    sout = re.sub(r'_-', '_', sout)  # every '_-' becomes '_'
    sout = re.sub(r'-$', '', sout)  # remove a '-' at the end
    return sout


# Inclusive bounds for a 4-digit number to be accepted as a year by the
# correct* date helpers; a candidate outside this range leaves the name as is.
minYear = 1880
maxYear = 2200


def correctPureYMD(s, regexen):
    """Move a compact YYYYMMDD date to the front of *s* as 'YYYY-MM-DD_'.

    regexen must capture year, month and day as groups 1 to 3.  Only the
    first match is validated: the year must lie within minYear..maxYear,
    the month must not exceed 12 and the day must not exceed 31.

    Returns (new_name, True) when the date was moved, (s, False) otherwise.
    """
    if not regexen.search(s):
        return (s, False)

    pieces = regexen.split(s)
    if not (minYear <= int(pieces[1]) <= maxYear):
        return (s, False)
    if int(pieces[2]) > 12 or int(pieces[3]) > 31:
        return (s, False)

    # date first, then the text that preceded it, then everything after it
    moved = '-'.join(pieces[1:4]) + '_' + pieces[0] + ''.join(pieces[4:])
    return (lastCleanup(accentsTidyP2(moved)), True)


def _normalise_date_part(token, upper):
    """Return *token* zero-padded to two digits, or None when it is rejected.

    The placeholder 'xx' is passed through unchanged.  Tokens that are not
    plain numbers (e.g. '1x' or '2*') or that exceed *upper* are rejected.
    """
    if token == 'xx':
        return token
    try:
        value = int(token)
    except ValueError:
        return None
    if value > upper:
        return None
    return token.zfill(2)


def correctYMD(s, regexen, regexus):
    """Move a separated full date to the front of *s* as 'YYYY-MM-DD_'.

    regexen is tried first and must capture day, separator, month,
    separator, year (groups 1 to 5); the order is reversed in the result.
    regexus is tried only when regexen does not match and must capture
    year, separator, month, separator, day; month and day may be the
    placeholder 'xx'.

    Only the first match is validated: both separators must be identical,
    the year must lie within minYear..maxYear, the month must not exceed 12
    and the day must not exceed 31.  A month or day that is neither 'xx' nor
    a plain number is rejected instead of raising ValueError.

    Returns (new_name, True) when the date was moved, (s, False) otherwise.
    """
    if regexen.search(s):  # push it to the front, order reversed
        splits = regexen.split(s)
        if splits[2] != splits[4]:
            return (s, False)
        if not (minYear <= int(splits[5]) <= maxYear):
            return (s, False)
        if int(splits[3]) > 12 or int(splits[1]) > 31:
            return (s, False)
        sout = (splits[5] + '-' + splits[3].zfill(2) + '-' + splits[1].zfill(2)
                + '_' + splits[0] + ''.join(splits[6:]))
        return (lastCleanup(accentsTidyP2(sout)), True)

    if regexus.search(s):  # push it to the front, order unchanged
        splits = regexus.split(s)
        if splits[2] != splits[4]:
            return (s, False)
        if not (minYear <= int(splits[1]) <= maxYear):
            return (s, False)
        month = _normalise_date_part(splits[3], 12)
        day = _normalise_date_part(splits[5], 31)
        if month is None or day is None:
            return (s, False)
        sout = (splits[1] + '-' + month + '-' + day
                + '_' + splits[0] + ''.join(splits[6:]))
        return (lastCleanup(accentsTidyP2(sout)), True)

    return (s, False)


def correctYM(s, regexen, regexus):
    """Move a month/year pair to the front of *s* as 'YYYY-MM_'.

    regexen (month, separator, year) is tried first; regexus (year,
    separator, month) is tried only when regexen does not match.  Only the
    first match of the selected pattern is validated: the year must lie
    within minYear..maxYear and the month must not exceed 12.

    Returns (new_name, True) when the date was moved, (s, False) otherwise.
    """
    # (pattern, index of the year group, index of the month group)
    layouts = ((regexen, 3, 1), (regexus, 1, 3))
    for pattern, year_at, month_at in layouts:
        if not pattern.search(s):
            continue
        pieces = pattern.split(s)
        if not (minYear <= int(pieces[year_at]) <= maxYear):
            return (s, False)
        if int(pieces[month_at]) > 12:
            return (s, False)
        moved = (pieces[year_at] + '-' + pieces[month_at].zfill(2)
                 + '_' + pieces[0] + ''.join(pieces[4:]))
        return (lastCleanup(accentsTidyP2(moved)), True)
    return (s, False)


def correctY(s, regex):
    """Move a lone 4-digit year to the front of *s* as 'YYYY_'.

    regex must capture the year as group 1.  Only the first match is
    validated against minYear..maxYear.

    Returns (new_name, True) when the year was moved, (s, False) otherwise.
    """
    if not regex.search(s):
        return (s, False)

    pieces = regex.split(s)
    year = pieces[1]
    if not (minYear <= int(year) <= maxYear):
        return (s, False)

    moved = year + '_' + pieces[0] + ''.join(pieces[2:])
    return (lastCleanup(accentsTidyP2(moved)), True)


def correctYY(s, regex):
    """Move a year range to the front of *s* as 'YYYY-YYYY_'.

    regex must capture first year, separator, second year (groups 1 to 3).
    Only the first match is validated: both years must lie within
    minYear..maxYear.

    Returns (new_name, True) when the range was moved, (s, False) otherwise.
    """
    if not regex.search(s):
        return (s, False)

    pieces = regex.split(s)
    for year_at in (1, 3):
        if not (minYear <= int(pieces[year_at]) <= maxYear):
            return (s, False)

    moved = pieces[1] + '-' + pieces[3] + '_' + pieces[0] + ''.join(pieces[4:])
    return (lastCleanup(accentsTidyP2(moved)), True)


# Regex built using https://regex101.com/ and https://docs.python.org/3/library/re.html
# Every part of each pattern is a capture group (date parts and separators
# alike), so re.split() returns the text around a match plus all its pieces.
# Separators are one of '/', '-', '_' or a space.
# All patterns are raw strings so that '\d', '\/' and '\-' reach the regex
# engine untouched instead of being invalid string escapes.
# DD sep MM sep YYYY: 3 date groups, 2 separator groups
regExDatesDDMMYYYY = re.compile(r'(\d\d*)([\/\-_ ])(\d\d*)([\/\-_ ])(\d\d\d\d)')
regExDatesMMYYYY = re.compile(r"(\d\d*)([\/\-_ ])(\d\d\d\d)")
# NOTE(review): inside a character class '*' is a literal asterisk, so month
# and day are exactly two characters, the second one being 'x', a digit or
# '*' -- confirm whether '[x\d]*' was intended.
regExDatesYYYYMMDD = re.compile(r'(\d\d\d\d)([\/\-_ ])([x\d][x\d*])([\/\-_ ])([x\d][x\d*])')
regExDatesYYYYMM = re.compile(r'(\d\d\d\d)([\/\-_ ])(\d\d*)')
regExDatesYYYY = re.compile(r"(\d\d\d\d)")

regExDatesPureYYYYMMDD = re.compile(r'(\d\d\d\d)(\d\d)(\d\d)')

regExDatesYYYYtoYYYY = re.compile(r'(\d\d\d\d)([\/\-_ ])(\d\d\d\d)')


def subs_dates(s):
    """Move the first recognised date of *s* to the front of the name.

    The date layouts are tried from the most specific to the least specific
    (year range, full date, month and year, compact YYYYMMDD, lone year);
    the first corrector that reports a match wins.  Returns the possibly
    rewritten name.
    """
    attempts = (
        (correctYY, (regExDatesYYYYtoYYYY,)),
        (correctYMD, (regExDatesDDMMYYYY, regExDatesYYYYMMDD)),
        (correctYM, (regExDatesMMYYYY, regExDatesYYYYMM)),
        (correctPureYMD, (regExDatesPureYYYYMMDD,)),
        (correctY, (regExDatesYYYY,)),
    )
    for corrector, patterns in attempts:
        s, matched = corrector(s, *patterns)
        if matched:
            break
    return s


# Log file shared by the printandlog* helpers.  It is opened in append mode as
# soon as the module is loaded, relative to the current working directory;
# no explicit close is visible in this part of the file.
logFileName = 'logrename.txt'
logFile = open(logFileName, 'a')


def printandlog(str):
    """Print *str* on the console, then append it plus a newline to the log file."""
    print(str)
    logFile.write(str + "\n")


def printandlogNoRC(str):
    """Print *str* and append it to the log file, without any line ending."""
    print(str, end='')
    logFile.write(str)


def printandlogTuple(tuple):
    """Print *tuple* as is, then append its items, concatenated, to the log file."""
    print(tuple)
    logFile.write(''.join(tuple))


def same_string(a, b):
    """Return True when *a* and *b* are the same string, False otherwise."""
    # Two strings that are each a prefix of the other are equal, so a direct
    # comparison replaces the former pair of find() scans.
    return a == b


def process(do_rename_dir, do_rename_files, do_dates=0):
    """Check, and optionally rename, every directory and file below '.'.

    Directories are handled first, then files.  For each kind the function
    either simulates (it only prints and logs the proposed new names) or
    really renames, and then prints and logs a summary.

    Parameters:
        do_rename_dir: when true, directories are renamed; otherwise the
            directory renaming is only simulated.
        do_rename_files: when true, files are renamed; otherwise the file
            renaming is only simulated.
        do_dates: when true, dates found in the names are moved to the front
            with subs_dates().

    Returns None.  If the operating system refuses a rename, a message is
    printed and the function returns immediately.

    NOTE(review): paths are built with hard-coded backslashes, so this
    presumably targets Windows only -- confirm before running elsewhere.
    """
    print("A sound will be produced at the end of the processing.")
    print("What follows is also logged into the file logRename.txt\r\n")
    print('Press <ctrl>+C to abort')
    printandlog("########################################")

    # Set the directory you want to start from
    rootDir = '.'
    nbChanges = 0

    # Simulation : we only parse the subs, don't need the full path
    if not do_rename_dir:
        nbDir = 0
        for dirName, subdirList, fileList in os.walk(rootDir):
            for subdir in subdirList:
                if subdir == '__pycache__':
                    continue
                stripped = accentsTidy(subdir)
                if do_dates:
                    stripped = subs_dates(stripped)
                nbDir = nbDir + 1
                if same_string(stripped, subdir):
                    printandlogNoRC('.')
                    sys.stdout.flush()
                else:
                    nbChanges = nbChanges + 1
                    printandlog("\n" + dirName + "\\" + stripped + "  <--  " + dirName + "\\" + ridoffbadchars(subdir))
    else:
        # Real Deal, we parse all the dirs using path from the rootDir, one by one, in O(n2)... Slow, not smart, but small code.
        # At most one directory is renamed per walk; the tree is then walked
        # again from the root until a full pass finds nothing to rename.
        found = 1
        while found:
            found = 0
            nbDir = 0
            for dirName, subdirList, fileList in os.walk(rootDir):
                if dirName == '..':
                    continue
                if dirName == '.\\__pycache__':
                    continue

                for subdir in subdirList:
                    if subdir == '__pycache__':
                        continue
                    if subdir == '.':
                        continue
                    if subdir == '..':
                        continue
                    stripped = accentsTidy(subdir)
                    if do_dates:
                        stripped = subs_dates(stripped)
                    nbDir = nbDir + 1
                    if same_string(stripped, subdir):
                        printandlogNoRC('.')
                        sys.stdout.flush()
                    else:
                        if not found:
                            nbChanges = nbChanges + 1
                            printandlog("")
                            strippedFullPath = dirName + "\\" + stripped
                            # The rename must start from the real on-disk
                            # name; ridoffbadchars() is only for display.
                            origFullPath = dirName + "\\" + subdir
                            printandlog("\n" + strippedFullPath + "  <--  " + ridoffbadchars(subdir))
                            try:
                                os.rename(origFullPath, strippedFullPath)
                            except OSError:
                                # OSError only: <ctrl>+C must still abort
                                print(
                                    "Dir. renaming was refused by the PC. Check that you don't have a file open in the file tree. You might need to close some file explorer. You may also be in a situation of dir already existing due to a previous renaming")
                                return
                            found = 1
                            break

    # BELL
    print('\a')
    printandlog('\n')

    if nbChanges == 0:
        printandlog("no change needed, all %i directories are already in compliance" % nbDir)
    else:
        if do_rename_dir:
            printandlog("%i change(s) done in a total of %i directories" % (nbChanges, nbDir))
        else:
            printandlog("%i change(s) needed in a total of %i directories" % (nbChanges, nbDir))
            printandlog(
                "this was a simulation, if you are happy with this renaming proposal, use 'naming_conventions_do_rename.py'")
    print("\nAll this was logged at the end of the file " + logFileName)

    nbFiles = 0
    nbFileChanges = 0

    if not do_rename_files:
        for dirName, subdirList, fileList in os.walk(rootDir):
            if dirName == '..':
                continue
            if dirName == '.\\__pycache__':
                continue
            for file in fileList:
                stripped = accentsTidyFiles(file)
                if do_dates:
                    stripped = subs_dates(stripped)
                nbFiles = nbFiles + 1
                if same_string(stripped, file):
                    printandlogNoRC('*')
                    sys.stdout.flush()
                else:
                    nbFileChanges = nbFileChanges + 1
                    printandlog("\n" + dirName + "\\" + stripped + "  <--  " + ridoffbadchars(file))

    else:
        # Real Deal, we parse all the files using path from the rootDir, one by one, in O(n2)... Slow, not smart, but small code.
        found = 1
        while found:
            found = 0
            # restart the count on every pass so that the summary reports the
            # number of files seen in one walk, not the sum over all passes
            nbFiles = 0
            for dirName, subdirList, fileList in os.walk(rootDir):
                if dirName == '.\\__pycache__':
                    continue

                for file in fileList:
                    stripped = accentsTidyFiles(file)
                    if do_dates:
                        stripped = subs_dates(stripped)
                    nbFiles = nbFiles + 1
                    if same_string(stripped, file):
                        printandlogNoRC('*')
                        sys.stdout.flush()
                    else:
                        nbFileChanges = nbFileChanges + 1
                        orig_full_path = dirName + "\\" + file
                        stripped_full_path = dirName + "\\" + stripped
                        printandlog("\n" + dirName + "\\" + stripped + "  <--  " + ridoffbadchars(file))
                        try:
                            os.rename(orig_full_path, stripped_full_path)
                        except OSError:
                            # OSError only: <ctrl>+C must still abort
                            print(
                                "File renaming was refused by the PC. Check that you don't have a file open in the file tree. You might need to close some file explorer. You may also be in a situation of file already existing due to a previous renaming")
                            return
                        found = 1

    # BELL
    print('\a')
    printandlog('\n')

    if nbFileChanges == 0:
        printandlog("no change needed, all %i files are already in compliance" % nbFiles)
    else:
        if do_rename_files:
            printandlog("%i change(s) done in a total of %i files" % (nbFileChanges, nbFiles))
        else:
            printandlog("%i change(s) needed in a total of %i files" % (nbFileChanges, nbFiles))
            printandlog(
                "this was a simulation, if you are happy with this renaming proposal, use 'naming_conventions_do_rename.py'")
    print("\nAll this was logged at the end of the file " + logFileName)


def test():
    """Manual check: print a sample of accented characters and its tidy form.

    The sample is printed once, followed by the result of accentsTidy(),
    which is then printed a second time between two blank lines.
    """
    s = 'æÁÀÂÄÃÅÇÉÈÊËÍÏÎÌÑÓÒÔÖÕÚÙÛÜÝœ- áàâäãåçéèêëíìîïñóòôöõúùûüý ÿ'
    print(s)
    s1 = accentsTidy(s)
    print(s1)
    # NOTE(review): an NFD/ASCII re-encoding of s1 used to be computed here
    # but was never displayed, so it has been dropped -- confirm whether the
    # second print below was meant to show that value instead.
    print()
    print(s1)
    print()


def print_syntax():
    """Print the command-line help, then run the shell's "pause" command."""
    usage_lines = (
        'syntax for simulation, with no effect on the name of the files or directories, for validation purposes:',
        'naming_conventions_files.py',
        'syntax for renaming effectively the files:',
        'naming_conventions_files.py -w',
        'syntax for renaming effectively the directories:',
        'naming_conventions_files.py -d',
        'additional option -t to use if one want dates to be moved to the front of the names and reorganized in YYYY-MM-DD format',
        'all 3 options can be combined',
    )
    for usage_line in usage_lines:
        print(usage_line)
    os.system("pause")


def main(argv):
    """Parse the command-line options in *argv* and run process().

    argv is the argument list without the program name.  With no argument a
    full simulation is run.  Otherwise the options are:
        -w  really rename the files
        -d  really rename the directories
        -t  move dates to the front of the names
        -h  print the help and exit

    An unknown option prints the help and exits with status 2.  When at
    least one argument is given, the function ends with sys.exit().
    """
    # decide from the argv parameter itself rather than from the global
    # sys.argv, so that the function behaves the same for any caller
    if not argv:
        process(0, 0, 0)
        os.system("pause")
        return

    do_files = 0
    do_dir = 0
    do_dates = 0
    try:
        opts, args = getopt.getopt(argv, "hwdt")
    except getopt.GetoptError:
        print_syntax()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-t':
            do_dates = 1
        elif opt == '-w':
            do_files = 1
        elif opt == '-d':
            do_dir = 1
        elif opt == '-h':
            print_syntax()
            sys.exit()

    process(do_dir, do_files, do_dates)
    os.system("pause")
    sys.exit()


# Script entry point: hand the command-line arguments (program name excluded)
# over to main().
if __name__ == "__main__":
    main(sys.argv[1:])