Получение статистики по файлам в директории: различия между версиями

Версия 00:07, 8 марта 2010

Получить статистику по типам файлов в директории можно с помощью следующего скрипта. Помимо основной цели, скрипт может «сортировать» файлы по директориям, исходя из их типа. Для получения типа файла используется эвристический анализатор file:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Script generates statistics table for specified directories sorted by type(by
# default), overall files size or files count. This script can also sort files
# into the directories by type using hardlinks(default) or softlinks. Type
# determination is done by `file` utility.
# 
# To get help just run script with wrong parameters.

import os
import sys
import subprocess
import string
import getopt
import operator

def usage():
    """Print the command-line help for fmetric.py to standard output.

    The text is assembled from adjacent string literals and printed with a
    single parenthesised argument, a form that is valid both as a Python 2
    print statement and as a Python 3 print() call. The emitted text is
    unchanged, including the four spaces that precede each tab on the
    option lines.
    """
    print(
        "Usage: fmetric.py [-C|-S] [-h] [-m] [-b [-s]] [-q] DIR [DIR] ...\n"
        "    \t-C\tSort by files count\n"
        "    \t-S\tSort by overall size\n"
        "    \t-h\tShow sizes in human-readable format\n"
        "    \t-m\tUse MIME types instead of file descriptions\n"
        "    \t-b\tCreate backup directory\n"
        "    \t-s\tUse softlinks (only useful with -b option)\n"
        "    \t-q\tUse \"quiet\" mode (do not write any error messages, skip on errors)"
    )


def file_info(filename, flags):
    """Return the type of *filename* as reported by the `file` utility.

    Runs ``/usr/bin/file <flags> -- <filename>`` and returns the part of its
    standard output that precedes the first comma, with newline characters
    stripped from both ends. Anything `file` writes to standard error is
    captured and discarded.

    Parameters:
        filename -- path of the file to inspect.
        flags    -- a single option string handed to `file` as one argument.

    Returns:
        The type string as text (``str`` on both Python 2 and Python 3).

    Raises:
        OSError if the `file` binary cannot be executed.
    """
    file_binary = "/usr/bin/file"
    # "--" ends option parsing so a path that starts with "-" is treated as a
    # file name instead of being interpreted as an option of `file`.
    process = subprocess.Popen(
        [file_binary, flags, "--", filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    file_stdout, _ = process.communicate()

    # On Python 3 the pipe yields bytes; decode so the text operations below
    # work. On Python 2 the output is already `str` and is left untouched.
    if not isinstance(file_stdout, str):
        file_stdout = file_stdout.decode("utf-8", "replace")

    return file_stdout.split(",")[0].strip('\n')


def sizeof_fmt(num, readable):
    """Format a byte count for display.

    Parameters:
        num      -- size in bytes.
        readable -- when false, *num* is returned unchanged; when true, a
                    string such as "1.5 KB" is returned.

    Returns:
        *num* itself if *readable* is false, otherwise a string made of the
        value scaled by powers of 1024 with one decimal place and a unit from
        bytes/KB/MB/GB/TB.

    Values of 1024 TB and above are expressed in TB (the largest unit) rather
    than falling off the end of the unit list and returning None.
    """
    if not readable:
        return num

    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0

    # Whatever is left is reported in the largest unit, however big it is.
    return "%3.1f %s" % (num, units[-1])


def _parse_options(argv):
    """Parse the command line into a settings dict and the list of DIRs.

    Prints the usage text and exits with status 2 on an unknown option or
    when a sort option (-S/-C) is given after another sort option.
    """
    settings = {
        "backup": False,
        "readable": False,
        "use_symlinks": False,
        "quiet_mode": False,
        "info_flags": "-b",
        # Column of the (type, size, count) row to sort by; 0 = type.
        "sort_key": 0,
    }

    try:
        opts, dirs = getopt.getopt(argv, "CShmbsq")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, _ in opts:
        if opt in ("-S", "-C"):
            # Only one sort order may be requested.
            if settings["sort_key"] > 0:
                usage()
                sys.exit(2)
            settings["sort_key"] = 1 if opt == "-S" else 2
        elif opt == "-h":
            settings["readable"] = True
        elif opt == "-b":
            settings["backup"] = True
        elif opt == "-m":
            settings["info_flags"] = "-bi"
        elif opt == "-s":
            settings["use_symlinks"] = True
        elif opt == "-q":
            settings["quiet_mode"] = True

    return settings, dirs


def _link_into_backup(filename, name, info, backup_dir, use_symlinks,
                      quiet_mode):
    """Link *filename* as <backup_dir>/<info>/<name>, reporting OSError."""
    # NOTE(review): `info` is used verbatim as a path component, so a type
    # string containing "/" (presumably MIME types under -m) yields nested
    # directories - confirm this is intended.
    backup_path = os.path.join(backup_dir, info)
    link_path = os.path.join(backup_path, name)

    try:
        if not os.path.exists(backup_path):
            os.makedirs(backup_path)

        if use_symlinks:
            # A symlink target is resolved relative to the directory holding
            # the link, so a relative `filename` would produce a dangling
            # link; store the absolute path instead.
            os.symlink(os.path.abspath(filename), link_path)
        else:
            os.link(filename, link_path)
    except OSError as err:
        if not quiet_mode:
            sys.stderr.write("Error creating link %s: %s\n" %
                             (link_path, err.strerror))


def _print_table(rows, sort_key, readable):
    """Print the header and the [type, size, count] rows as a table."""
    # Never narrower than the header itself; this also covers an empty table,
    # where there is no type string to measure.
    type_width = max([len("Type")] + [len(row[0]) for row in rows])

    print("%s | %s | %s" % ("Type".ljust(type_width),
                            "Size".ljust(12),
                            "Count"))
    print("=" * (type_width + 3 + 12 + 3 + 12))

    # Types sort ascending; sizes and counts sort largest first.
    ordered = sorted(rows, key=operator.itemgetter(sort_key),
                     reverse=(sort_key != 0))
    for info, size, count in ordered:
        print("%s | %s | %d" % (info.ljust(type_width),
                                str(sizeof_fmt(size, readable)).ljust(12),
                                count))


def main(argv):
    """Gather per-type file statistics for the given directories and print them.

    Parameters:
        argv -- command-line arguments without the program name: any of the
                options -C, -S, -h, -m, -b, -s, -q followed by directories.

    Every file found by walking each directory is classified with
    file_info(); total size and file count are accumulated per type and
    printed as a table sorted by type (default), size (-S) or count (-C).
    With -b each file is additionally hard-linked (or symlinked with -s) into
    <DIR>/backup/<type>/.

    Exits with status 2 on invalid options and with status 1 if -b is given
    and <DIR>/backup already exists. Errors on individual files are written
    to stderr (unless -q) and the file is skipped.

    All print calls use a single parenthesised argument so the code is valid
    on both Python 2 and Python 3.
    """
    settings, dirs = _parse_options(argv)
    backup = settings["backup"]
    quiet_mode = settings["quiet_mode"]

    # `rows` keeps [type, size, count] entries in first-seen order so that
    # ties keep a stable order when sorted; `row_by_type` finds them in O(1).
    rows = []
    row_by_type = {}

    for top_dir in dirs:
        backup_dir = os.path.join(top_dir, "backup")
        if backup and os.path.exists(backup_dir):
            print("Backup directory \"%s\" exists, exiting" % backup_dir)
            sys.exit(1)

        for root, _, files in os.walk(top_dir):
            for name in files:
                filename = os.path.join(root, name)

                # This can fail for moved files, broken links, etc. Report
                # the error on stderr and carry on with the next file.
                try:
                    size = os.path.getsize(filename)
                except OSError as err:
                    if not quiet_mode:
                        sys.stderr.write(
                            "Error reading file size for %s: %s\n" %
                            (filename, err.strerror))
                    continue

                info = file_info(filename, settings["info_flags"])

                row = row_by_type.get(info)
                if row is None:
                    row = [info, 0, 0]
                    row_by_type[info] = row
                    rows.append(row)
                row[1] += size
                row[2] += 1

                if backup:
                    _link_into_backup(filename, name, info, backup_dir,
                                      settings["use_symlinks"], quiet_mode)

    _print_table(rows, settings["sort_key"], settings["readable"])
                

if __name__ == '__main__':
    # Script entry point: at least one command-line argument is required;
    # otherwise show the help text and exit with status 2.
    cli_args = sys.argv[1:]
    if not cli_args:
        usage()
        sys.exit(2)

    main(cli_args)

Скрипт также будет полезен в связке с другим маленьким скриптом, позволяющим отсортировать найденные файлы в директории lost+found по общим директориям.

#!/bin/bash
#
# This script intends to sort files in the lost+found directory to the
# directories depending on the files' parent inodes. This can save much time
# during hand-recovering
#
# The one and only argument is `lost+found`-like directory. No checks are done.
# You are warned.

# Move every entry named "<prefix>_<rest>" found directly inside "$1" into the
# directory "$1/<prefix>", creating that directory when needed.
#
# All expansions are quoted and `read` is made literal (IFS= and -r) so names
# containing spaces or backslashes are handled. Names containing newlines are
# still not supported because find's output is read line by line.
# -mindepth 1 keeps find from returning "$1" itself when the directory's own
# name contains an underscore.
find "$1" -mindepth 1 -maxdepth 1 -name '*_*' | while IFS= read -r FILE
do
    # Strip the leading directory part without spawning `basename`.
    FILE_NAME=${FILE##*/}
    # Everything before the first underscore names the target directory.
    DIR_NAME="$1/${FILE_NAME%%_*}"
    mkdir -p -- "$DIR_NAME"
    mv -- "$FILE" "$DIR_NAME"
done

Самые свежие версии файла будут находиться на GitHub.com

Версия 00:05, 8 марта 2010 (просмотреть исходный код) Jolly Roger (обсуждение \| вклад) (Создана новая страница размером Для получения статистики по типам файлов в директории можно с помощью след...)		Версия 00:07, 8 марта 2010 (просмотреть исходный код) Jolly Roger (обсуждение \| вклад) Следующая правка →
Строка 200:		Строка 200:
	Самые свежие версии файла будут находиться на [http://gist.github.com/315699 GitHub.com]		Самые свежие версии файла будут находиться на [http://gist.github.com/315699 GitHub.com]

	[[Category:Скрипт]][[Category:Bash]]		[[Category:Скрипт]][[Category:Bash]][[Category:Python]]

Получение статистики по файлам в директории: различия между версиями

Версия 00:07, 8 марта 2010

Навигация

Действия на странице

Действия на странице

Персональные инструменты

Навигация

Поиск

Участникам

Инструменты