Codecount - Python

I had been using cloc.exe quite often to count my source code and understand how large various files were. What I ran into was the need to count other files or languages; I could have used cloc for them, but I would have needed to configure it. I looked at the source, but it was in Perl, so I thought it would be a good opportunity to get a little coding done in Python.

Counting Code

To accomplish this task I started to think through the things I would need to do:

  • Handle command line arguments.

  • Walk through all directories to get the files.
    • Ignore looking through "special" directories.
  • Scan each file
    • Know how to tell code from comments in various languages.
    • Know if a file is a duplicate.
    • Keep track of code, comments, blanks, and total.
  • Produce results
    • By File
    • By Language

Most of these are pretty straightforward. The most interesting part is building a parser that can handle many different types of files. I came up with the following data structure to hold a language definition, that is, the block-comment and line-comment markers, the file extensions, and, for some languages, a marker that indicates the end of code:

"C": {
    "bcomm":        {"/*": "*/"},
    "comment":      ["//"],
    "endcode":      [],
    "ext":          [".c"],
    },

The command line interface is:

usage: codecount.py [-h] [-f | -g | -l | -o] [-t] [-d] [-i]
[-m {xml,json,yaml}]
[path]

A Source Code Counter

positional arguments:
path                  Provide a file path to count the code

optional arguments:
-h, --help            show this help message and exit
-f, --byfile          Totals for each file - Default
-g, --bygroup         Totals for each directory
-l, --bylang          Totals for each language
-o, --output-languages
Dump the known language file
-t, --time            Print the runtime at completion
-d, --debug           Turn on debug output
-i, --include         Include duplicate files
-m {xml,json,yaml}, --markup {xml,json,yaml}
Produce output in markup format

The source code for this project is listed below:

#!/usr/bin/env python3
###############################################################################
# Copyright 2013 Cory Lutton                                                  #
#                                                                             #
# Licensed under the Apache License, Version 2.0 (the "License");             #
# you may not use this file except in compliance with the License.            #
# You may obtain a copy of the License at                                     #
#                                                                             #
#    http://www.apache.org/licenses/LICENSE-2.0                               #
#                                                                             #
# Unless required by applicable law or agreed to in writing, software         #
# distributed under the License is distributed on an "AS IS" BASIS,           #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.    #
# See the License for the specific language governing permissions and         #
# limitations under the License.                                              #
###############################################################################
"""
Count the lines in text type files recursively through a directory
based on the extension of the file.
"""


__version__ = "0.5"

import os
import sys
import time
import json
import logging
import argparse
import hashlib
import xml.etree.ElementTree as ET


class CodeCounter:
    """ Main Code Counter Class

    Collects the files under a path, classifies each line of every file
    with a known extension as blank, comment or code, and prints a report
    by file, by directory or by language.
    """
    # Directory names that walk() prunes from the traversal.
    ignoredirs = (
            ".hg",
            ".bzr",
            ".svn",
            ".git",
            "__pycache__")
    # Language definitions, keyed by the language name used in reports.
    #   "bcomm"   - block comments: opening marker -> closing marker
    #   "comment" - prefixes that make a (stripped) line a line comment
    #   "endcode" - a line equal to one of these ends the code; scanfile()
    #               counts every later non-blank line as a comment
    #   "ext"     - lower-case file extensions, including the leading dot
    # scanfile() stops at the first language whose "ext" list matches.
    langs = {
        "C": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["//"],
            "endcode":      [],
            "ext":          [".c"],
            },
        "C++": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["//"],
            "endcode":      [],
            "ext":          [".cpp"],
            },
        "C#": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["//"],
            "endcode":      [],
            "ext":          [".cs"],
            },
        "C Header": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["//"],
            "endcode":      [],
            "ext":          [".h"],
            },
        "CSS": {
            "bcomm":        {"/*": "*/"},
            "comment":      [],
            "endcode":      [],
            "ext":          [".css"],
            },
        "Cython": {
            "bcomm":        {},
            "comment":      ["#"],
            "endcode":      [],
            "ext":          [".pyx"],
            },
        "Go": {
            "bcomm":        {},
            "comment":      ["//"],
            "endcode":      [],
            "ext":          [".go"],
            },
        "HTML": {
            "bcomm":        {'<!--': '-->'},
            "comment":      [],
            "endcode":      [],
            "ext":          [".html", ".htm", ".jinja"],
            },
        # NOTE(review): ".mac" is also listed under "VB" below. Because the
        # first matching language wins (and dicts keep insertion order on
        # Python 3.7+), .mac files are reported as "IBM Macro".
        "IBM Macro": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["'"],
            "endcode":      [],
            "ext":          [".mac"],
            },
        "Java": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["//"],
            "endcode":      [],
            "ext":          [".java"],
            },
        "Javascript": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["//"],
            "endcode":      [],
            "ext":          [".js"],
            },
        # NOTE(review): "//" is listed as a Perl line-comment prefix;
        # confirm this is intended.
        "Perl": {
            "bcomm":        {},
            "comment":      ["//", "#"],
            "endcode":      ["__END__"],
            "ext":          [".pl"],
            },
        "PHP": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["//", "#"],
            "endcode":      [],
            "ext":          [".php"],
            },
        "Python": {
            "bcomm":        {},
            "comment":      ["#"],
            "endcode":      [],
            "ext":          [".py", ".pyw"],
            },
        # NOTE(review): the second prefix starts with a non-ASCII character
        # that may be an encoding artefact - verify against real RPG source.
        "RPG": {
            "bcomm":        {},
            "comment":      [
                "*", "‚*", "C*", "c*", "D*", "d*", "F*", "f*", "H*", "h*"],
            "endcode":      [],
            "ext":          [".rpgle", ".sqlrpgle"],
            },
        "Ruby": {
            "bcomm":        {},
            "comment":      ["#"],
            "endcode":      ["__END__"],
            "ext":          [".rb"],
            },
        # No comment markers are defined for SQL, so every non-blank line of
        # a .sql file is counted as code.
        "SQL": {
            "bcomm":        {},
            "comment":      [],
            "endcode":      [],
            "ext":          [".sql"],
            },
        # No comment markers: every non-blank line of a .txt file is "code".
        "Text": {
            "bcomm":        {},
            "comment":      [],
            "endcode":      [],
            "ext":          [".txt"],
            },
        # NOTE(review): "/* */" is configured as a VB block comment and
        # ".mac" duplicates the "IBM Macro" entry above - confirm both.
        "VB": {
            "bcomm":        {"/*": "*/"},
            "comment":      ["'"],
            "endcode":      [],
            "ext":          [".vb", ".mac", ".frm", ".bas"],
            },
        "XML": {
            "bcomm":        {'<!--': '-->'},
            "comment":      [],
            "endcode":      [],
            "ext":          [".xml", ".csproj"],
            },
        "Yaml": {
            "bcomm":        {},
            "comment":      ["#"],
            "endcode":      [],
            "ext":          [".yaml"],
            }
        }

    def __init__(self):
        self.filelist = []
        self.tfiles = 0
        self.tlines = 0
        self.tcode = 0
        self.tcomments = 0
        self.tblanks = 0
        self.args = None

    def run(self):
        """ Performs a code count """
        self.commandline()
        starttime = time.time()

        if self.args.debug:
            logging.basicConfig(level=logging.DEBUG)

        self.listfiles()

        self.output()

        for filename in self.filelist:
            self.scanfile(filename)

        if not self.args.include:
            self.remove_duplicates()

        for filename in self.filelist:
            if filename.lines > 0:
                self.tfiles += 1

            self.tlines += filename.lines
            self.tcode += filename.code
            self.tcomments += filename.comments
            self.tblanks += filename.blanks

        assert(self.tlines - self.tcode - self.tcomments - self.tblanks == 0)

        self.report()

        if self.args.time:
            print("Runtime: {} seconds".format(
                str(round(time.time() - starttime, 3))))

# -----------------------------------------------------------------------------
#   Processing methods
# -----------------------------------------------------------------------------
    def commandline(self):
        """ Parse the command line arguments """
        parser = argparse.ArgumentParser(description="A Source Code Counter")

        # Positional
        parser.add_argument('path',
                nargs='?',
                default=".",
                help='Provide a file path to count the code')

        # Output options
        outgroup = parser.add_mutually_exclusive_group()

        outgroup.add_argument("-f", "--byfile",
                action="store_true",
                default=True,
                help="Totals for each file - Default")

        outgroup.add_argument("-g", "--bygroup",
                action="store_true",
                default=False,
                help="Totals for each directory")

        outgroup.add_argument("-l", "--bylang",
                action="store_true",
                help="Totals for each language")

        outgroup.add_argument("-o", "--output-languages",
                action="store_true",
                help="Dump the known language file")

        # General Options
        parser.add_argument("-t", "--time",
                action="store_true",
                default=True,
                help="Print the runtime at completion")

        parser.add_argument("-d", "--debug",
                action="store_true",
                help="Turn on debug output")

        parser.add_argument("-i", "--include",
                action="store_true",
                help="Include duplicate files")

        parser.add_argument("-m", "--markup",
                choices=["xml", "json", "yaml"],
                help="Produce output in markup format")

        self.args = parser.parse_args()

    def listfiles(self):
        """ Get the list of files to check. """
        if os.path.isdir(self.args.path):
            logging.info("Path walked: " + self.args.path)
            self.walk(self.args.path)
        else:
            root, filename = os.path.split(self.args.path)
            if os.path.isfile(root + '/' + filename):
                self.filelist.append(Filename(self.args.path, root, filename))

    def walk(self, path):
        """ Walk the path excluding certain known directories """
        for root, folders, files in os.walk(path):

            # Must go from ignoredirs
            for folder in self.ignoredirs:
                if folder in folders:
                    logging.info("Skipping Folder: " + folder)
                    folders.remove(folder)

            # Append the file info
            for filename in files:
                if os.path.isfile(root + '/' + filename):
                    self.filelist.append(Filename(path, root, filename))

    def scanfile(self, filename):
        """ The heart of the codecounter, Scans a file to identify
        and collect the metrics based on the classification. """
        strblock = None
        endblock = None
        inblock = False
        endcode = False
        sha256 = hashlib.sha256()

        if filename.size == 0:
            logging.info("Skipping File  : " + filename.name)
            return

        # Identify language
        for l in self.langs:
            if filename.extension in self.langs[l]["ext"]:
                filename.lang = l
                break

        # Unknown files don't need processed
        if filename.lang is None:
            logging.info("Skipping File  : " + filename.name)
            return

        # Using the with file opening in order to ensure no GC issues.
        with open(os.path.join(filename.path, filename.name),
                encoding="utf-8", errors='ignore') as fp:

            for line in fp:
                sha256.update(line.encode("utf-8"))
                filename.lines += 1
                line = line.strip()
                identified = False

                if line == "":
                    logging.info(" blak  " + str(filename.lines))
                    filename.blanks += 1
                    continue

                if endcode:
                    filename.comments += 1
                    continue

                # Check to see if it is a block or was an opening block
                # ex1 = "/*  */ if x;"          = Code, not inblock
                # ex2 = "*/ if x; /*"           = Code, inblock
                # ex3 = " if x; /*"             = Code, inblock
                # ex4 = "/*  */ if x; /* */ .." = Code, not inblock
                # ex4 = "*/"                    = Comment, not inblock
                # ex5 = "/* */"                 = Comment, not inblock
                # ex6 = "/*"                    = Comment, inblock
                # Two scenarios,
                # 1 - comments removed, code remains
                # 2 - Comments removed but block is open
                if not inblock:
                    for token in self.langs[filename.lang]["bcomm"]:
                        strblock = token
                        endblock = self.langs[filename.lang]["bcomm"][token]

                        while token in line:
                            spos = line.find(strblock)
                            epos = line.find(endblock, spos) + len(endblock)
                            # If a block has started then check for an exit
                            if endblock in line:
                                line = line.replace(line[spos: epos], "", 1)
                            else:
                                line = line.replace(line[spos:], "", 1)
                                inblock = True      # left open
                else:
                    # Continue until the block ends... when left open
                    if endblock in line:
                        inblock = False     # End the block
                        line = line.replace(
                                line[:line.find(endblock) + len(endblock)],
                                "").strip()
                    else:
                        line = ""

                # From the block but no hidden code made it out the back....
                if line is "":
                    logging.info(" bloc  " + str(filename.lines) + line)
                    filename.comments += 1
                    continue

                # Check line comment designators
                for token in self.langs[filename.lang]["comment"]:
                    if line.startswith(token):
                        logging.info(" line  " + str(filename.lines) + line)
                        filename.comments += 1
                        identified = True
                        break

                if identified:
                    continue

                # If not a blank or comment it must be code
                logging.info(" code  " + str(filename.lines) + line)
                filename.code += 1

                # Check for the ending of code statements
                for end in self.langs[filename.lang]["endcode"]:
                    if line == end:
                        endcode = True

        # Store the hash of this file for comparison to others
        logging.info("Total  "
                + " " + str(filename.blanks)
                + " " + str(filename.comments)
                + " " + str(filename.code))

        filename.sha256 = sha256.digest()

# -----------------------------------------------------------------------------
#   Reporting Methods
# -----------------------------------------------------------------------------
    def report(self):
        """ Produce the report. """
        if self.args.bylang:
            self.report_lang()
        elif self.args.bygroup:
            self.report_dir()
        elif self.args.byfile:
            self.report_file()

    def remove_duplicates(self):
        """ Remove duplicate files from the files list """
        unique = {}
        uniquelist = []

        for filename in self.filelist:
            if filename.sha256 not in unique:
                unique[filename.sha256] = filename

        for u in unique:
            uniquelist.append(unique[u])

        self.filelist = uniquelist

    def report_header(self, reporttype):
        """ Build the report header: version line, rule, column titles
        and a closing rule. reporttype labels the first column. """
        rule = "-" * 79
        title = "Codecount - v " + __version__
        columns = "{:<29}{:>10}{:>10}{:>10}{:>10}{:>10}".format(
            reporttype, "Files", "Blank", "Comment", "Code", "Lines")
        return "\n".join((title, rule, columns, rule))

    def report_detail(self, text, files, blanks, comments, code, lines):
        """ Format one report row: a 29 wide left aligned label followed
        by five 10 wide right aligned counts. """
        row = "{:<29}{:>10}{:>10}{:>10}{:>10}{:>10}"
        return row.format(text, files, blanks, comments, code, lines)

    def report_summary(self):
        """ Return the summary """
        return ("-" * 79 + "\n"
                + "{:<29}".format("Totals")
                + "{:>10}".format(self.tfiles)
                + "{:>10}".format(self.tblanks)
                + "{:>10}".format(self.tcomments)
                + "{:>10}".format(self.tcode)
                + "{:>10}".format(self.tlines) + "\n"
                + "-" * 79)

    def report_file(self):
        """ Run a report by file """

        print(self.report_header("By File"))

        for filename in sorted(self.filelist,
                key=lambda filename: filename.code, reverse=True):

            if filename.lang is None:
                continue

            print(self.report_detail(
                filename.shortname,
                1,
                filename.blanks,
                filename.comments,
                filename.code,
                filename.lines))

        print(self.report_summary())

    def report_dir(self):
        """ Run a report by directory """

        print(self.report_header("By Directory"))

        # Subtotal by directory
        total = {"dir": None, "blank": 0, "comment": 0,
                "file": 0, "code": 0, "line": 0}

        for filename in sorted(self.filelist,
                key=lambda filename: str(filename.shortpath)):

            if filename.shortpath is None:
                continue

            if filename.shortpath == total["dir"]:
                total["blank"] += filename.blanks
                total["comment"] += filename.comments
                total["file"] += 1
                total["code"] += filename.code
                total["line"] += filename.lines
            else:
                # Print collected data before resetting.
                if total['dir']:
                    print(self.report_detail(
                        "",
                        total["file"],
                        total["blank"],
                        total["comment"],
                        total["code"],
                        total["line"]))
                    print("-" * 79)

                total["dir"] = filename.shortpath
                total["blank"] = filename.blanks
                total["comment"] = filename.comments
                total["file"] = 1
                total["code"] = filename.code
                total["line"] = filename.lines
                print("{:<78}".format(total["dir"]))

        # Print the last
        if total['dir']:
            print(self.report_detail(
                "",
                total["file"],
                total["blank"],
                total["comment"],
                total["code"],
                total["line"]))

        print(self.report_summary())

    def report_lang(self):
        """ Run a report by language """

        print(self.report_header("By Language"))

        # Subtotal by language
        total = {"lang": None, "blank": 0, "comment": 0,
                "file": 0, "code": 0, "line": 0}
        for filename in sorted(self.filelist,
                key=lambda filename: str(filename.lang)):

            if filename.lang is None:
                continue

            if filename.lang == total["lang"]:
                total["blank"] += filename.blanks
                total["comment"] += filename.comments
                total["file"] += 1
                total["code"] += filename.code
                total["line"] += filename.lines
            else:
                # Print collected data before resetting.
                if total['lang']:
                    print(self.report_detail(
                        total["lang"],
                        total["file"],
                        total["blank"],
                        total["comment"],
                        total["code"],
                        total["line"]))

                total["lang"] = filename.lang
                total["blank"] = filename.blanks
                total["comment"] = filename.comments
                total["file"] = 1
                total["code"] = filename.code
                total["line"] = filename.lines

        # Print the last.
        if total['lang']:
            print(self.report_detail(
                total["lang"],
                total["file"],
                total["blank"],
                total["comment"],
                total["code"],
                total["line"]))

        print(self.report_summary())

# -----------------------------------------------------------------------------
#   Other Output Methods
# -----------------------------------------------------------------------------
    def output(self):
        """ Output rather than report. """
        if self.args.output_languages:
            self.output_json_langs()
            sys.exit("File languages.json created.")
            self.output_xml_langs()
            sys.exit("File languages.xml created.")

    def output_json_langs(self):
        """ Output languages as a JSON file. """
        outputfile = open("languages.json", "w")
        json.dump(self.langs, outputfile, indent=4, sort_keys=True)
        outputfile.close()

    def output_xml_langs(self):
        """ Output languages as a JSON file. """
        outputfile = open("languages.xml", "w", encoding="utf-8")
        root = ET.Element('root')
        child = ET.SubElement(root, 'child')
        child.attrib['name'] = "Charlie"
        tree = ET.ElementTree(root)
        tree.write(outputfile, encoding="unicode")
        outputfile.close()


# -----------------------------------------------------------------------------
#   Helpers
# -----------------------------------------------------------------------------
class Filename:
    """ Per-file record: where the file is, how to display it and the
    counters that the scan fills in. """

    def __init__(self, argpath, path, filename):
        """ argpath is the path given on the command line, path the
        directory holding the file and filename its base name. """
        # Location and identity
        self.path = path
        self.name = filename
        _, suffix = os.path.splitext(filename)
        self.extension = suffix.lower()
        self.size = os.path.getsize(os.path.join(path, filename))

        # Results, filled in while the file is scanned
        self.lang = None
        self.sha256 = None
        self.lines = self.comments = self.blanks = self.code = 0

        # Directory relative to argpath, at most 78 characters, shown
        # with a leading "."
        offset = len(argpath)
        self.shortpath = "." + path[offset: offset + 78].strip()

        # Names over 30 characters are cut to 20 plus "~" and the extension
        if len(filename) <= 30:
            self.shortname = filename
        else:
            self.shortname = filename[:20].strip() + "~" + self.extension


# Script entry point: build a counter and run it.
# To profile a run, wrap the call in cProfile.run() instead.
if __name__ == "__main__":
    counter = CodeCounter()
    counter.run()
    # cProfile.run('cc.run()')