#!/usr/bin/python

# Copyright 2008-2009 Junior (Frederic) FLEURIAL MONFILS
#
# This file is part of PyDepend.
#
# PyDepend is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# or see <http://www.opensource.org/licenses/gpl-3.0.html>
#
# Contact:
#     Junior FLEURIAL MONFILS <frederic dot fleurialmonfils at cetic dot be>
#     Jean-Christophe DEPREZ <jean-christophe dot deprez at cetic dot be>

"""This module lists the dependencies between packages found in the source code
The dependencies are reported in dot format.

Depending on the --files switch
- not set (default): a package is considered as a directory
- set: a package is considered as a single source file

It also reports, in csv format, a set of metrics for these packages:
- Ca: the number of packages using the given package
- Ce: the number of packages used by a given package
- I: the instability of a given package (Ce/(Ca+Ce))
- Idep: the number of illegal dependencies for a given package, in other
        words, this is the number of dependencies it has with a package having
        a higher instability I than it has.
See usage for options, or type pycouplings.py --help
"""

import os
from os.path import basename, dirname, isdir, join, realpath, splitext, sep
import sys
import re
from itertools import chain
import csv
import compiler

def usage(message=""):
    """Write the command line help on stderr and terminate the program

    @param message: optional text (typically an error description) that is
                    appended to the help, separated from it by a blank line

    This function never returns: it always ends with sys.exit(1)
    """
    # the blank line separating the help from the message is only needed
    # when there is a message to show
    if message:
        trailer = "\n\n%s" % message
    else:
        trailer = ""
    program = basename(sys.argv[0])
    text = """Usage: %s options

Produces a DOT file with the dependencies between packages
by analysing the input source files and their 'import/include' declarations.
The DOT output can be used by sccmap and other tools to find cycles, etc.

Produces also a CSV output with basic metrics: Ca, Ce, I, and Idep
where Idep are the number of illegal dependencies for the given package.

The input should be a list of source files for example generated by
 $ find /path/to/src -name *.java|cpp|py (Unix) or
 > dir /b/s \\path\\to\\src\\*.java|cpp|py (Windows)

The DOT and CSV content is output to STDOUT if not otherwise stated

Options:
  -l LANGUAGE  programming language of the source code [java, cpp, python]
  -f PATH      file containing one line per file to parse
  -c REGEXP    regular expression of counted dependencies
  -e REGEXP    regular expression of excluded dependencies
  --internal   only list dependencies internal to the project (default: False)
  --all        count all dependencies for couplings (default: False)
  --files      packages are files instead of directories (default: False)
  --fast       speed up computation (default: False)
  --dot FILE   report DOT content to the specified file (default: stdout)
  --csv FILE   report CSV content to the specified file (default: stdout)
  --cleanup    remove comments and strings in source file before processing
               note: setting this option reduces performance on large projects
                     only relevant when -l is set to 'java' or 'cpp'
               (default: False)
  -h, --help   print this help%s""" % (program, trailer)
    sys.stderr.write(text + "\n")
    sys.exit(1)

# ==============================================================================
#                           CREATES THE VARIOUS MAPS USED THROUGHOUT THE PROGRAM
# ==============================================================================

# ------------------------------------------------------------------------------
# Create the REGEXP map
# ------------------------------------------------------------------------------

# Map with the needed regular expressions (only for cpp and java)
#
# MLCOM is a procedure that takes the size of the string and returns the RE
# SLCOM, STRING, IMPORT and PACKAGE are mostly language dependent
REGEXP = {}

def init_REGEXP():
    """Fill the module level REGEXP map with the patterns of cpp and java

    Every language entry is a map with the keys
    - MLCOM: function taking a size and returning the compiled pattern of a
             multi-line comment whose body repeats at most <size> times
    - SLCOM: compiled pattern of a single-line comment
    - STRING: compiled pattern of a string literal
    - IMPORT: compiled pattern of an '#include' or 'import' declaration
    - PACKAGE: compiled pattern of a package declaration

    java shares the MLCOM and SLCOM entries of cpp
    """
    def multi_line_comment(size):
        return re.compile(r"(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n]))){0,%d}?\*+/)" % size, re.M)

    single_line_comment = re.compile(r"(//[^\r\n]*)")

    REGEXP["cpp"] = {
        "MLCOM": multi_line_comment,
        "SLCOM": single_line_comment,
        # the look-behind keeps the quoted file name of an #include
        "STRING": re.compile(r'(?<!#include\s)"[^"\\\r\n]*(?:\\.[^"\\\r\n]*)*"'),
        "IMPORT": re.compile(r'^\s*#include\s+("[^"]+"|<[^>]+>)', re.M),
        "PACKAGE": re.compile(r'.*'),
    }
    REGEXP["java"] = {
        "MLCOM": multi_line_comment,
        "SLCOM": single_line_comment,
        "STRING": re.compile(r'"[^"\\\r\n]*(?:\\.[^"\\\r\n]*)*"'),
        "IMPORT": re.compile(r"^\s*import(?:\s+(static))?\s+([\w.*]+)\s*;", re.M),
        "PACKAGE": re.compile(r"^\s*package\s+([\w.]+)\s*;", re.M),
    }

init_REGEXP()

# ------------------------------------------------------------------------------
# Create the GET_PACKAGE_NAME map
# ------------------------------------------------------------------------------

def cpp__get_package_name(filepath, options, len_common_prefix, source):
    """Build the dotted package name of a file from its path (cpp and python)

    @param filepath: full path of the file
    @param options: options provided at command line
                    used: files
    @param len_common_prefix: number of leading characters of filepath that
                              are dropped before the name is built
    @param source: not used

    @return the remaining path with the separators replaced by dots, taken from
            - the file path without its extension (options.files is set)
            - the directory containing the file (options.files is not set,
              or the file based name turned out to be empty)
    """
    if options.files:
        without_extension = splitext(filepath)[0]
        file_based = without_extension[len_common_prefix:].replace(sep, ".")
        if file_based:
            return file_based
    containing_directory = dirname(filepath)
    return containing_directory[len_common_prefix:].replace(sep, ".")

python__get_package_name = cpp__get_package_name

def java__get_package_name(filepath, options, len_common_prefix, source):
    """Build the package name of a java file from its 'package' declaration

    @param filepath: full path of the file
    @param options: options provided at command line
                    used: files
    @param len_common_prefix: number of leading characters of filepath that
                              are dropped when the path is used in the name
    @param source: source of the file located at <filepath>

    @return - the declared package (options.files is not set)
            - the declared package followed by the file name without its
              extension (options.files is set)
            when the source declares no package, '__Default__' is used as
            the package and, if options.files is set, it is followed by the
            dotted file path without its extension
    """
    declaration = REGEXP["java"]["PACKAGE"].search(source)
    if declaration:
        declared = declaration.group(1)
        if not options.files:
            return '%s' % declared
        return '%s.%s' % (declared, splitext(basename(filepath))[0])
    # no package defined, use the default package
    if not options.files:
        return '__Default__'
    relative = splitext(filepath)[0][len_common_prefix:]
    return '__Default__.%s' % relative.replace(sep, ".")

# Map with the appropriate get_package_name function based on the language
GET_PACKAGE_NAME = {
    "java": java__get_package_name,
    "cpp": cpp__get_package_name,
    "python": python__get_package_name,
}

# ------------------------------------------------------------------------------
# Create the CLEANUP_SOURCE map
# ------------------------------------------------------------------------------

def cpp__cleanup_source(source, options):
    """Strip comments and strings from the source when this was requested

    @param source: string containing the contents of the source file
    @param options: options provided at command line
                    used: cleanup, l

    @return the source unchanged when options.cleanup is not set, otherwise
            the result of language__cleanup_source for the language options.l
    """
    if not options.cleanup:
        return source
    return language__cleanup_source(source, options.l)

def language__cleanup_source(source, language):
    """Remove comments and strings from the source, preserving line numbers

    The source is scanned from left to right.  At every step the construct
    (multi-line comment, single-line comment or string) that starts first
    is emptied:
    - a multi-line comment becomes '/*', one '*<newline>' per newline it
      contained (so that line numbers are kept), and '*/'
    - a single-line comment becomes '//'
    - a string becomes '""'
    The scan then resumes right after the emptied construct, so that comment
    markers inside strings (and quotes inside comments) are never mistaken
    for a new construct.

    @param source: string containing the contents of the source file
    @param language: programming language should be one of (java, cpp)

    @return the source with the comments and strings removed
    """
    patterns = REGEXP[language]
    # MLCOM is a factory that needs an upper bound for the comment size; the
    # source only shrinks during the cleanup, so its initial size is enough
    re_mlcom = patterns["MLCOM"](len(source))
    re_slcom = patterns["SLCOM"]
    re_string = patterns["STRING"]

    start = 0
    while True:
        # find, for every kind of construct, its next occurrence after the
        # cursor; the rank keeps the original priority (comment before string)
        # and guarantees that match objects themselves are never compared
        found = []
        for (rank, regexp) in enumerate((re_mlcom, re_slcom, re_string)):
            match = regexp.search(source, start)
            if match:
                found.append((match.start(), rank, match))
        if not found:
            break

        (position, rank, match) = min(found)
        if rank == 0:
            replacement = "/*%s*/" % ("*\n" * match.group().count("\n"))
        elif rank == 1:
            replacement = "//"
        else:
            replacement = '""'

        # replace exactly the span that was matched: pattern.sub(..., 1)
        # would replace the first match found from the beginning of the
        # source, which is an already emptied construct after the first pass
        source = source[:position] + replacement + source[match.end():]
        # resume right after the replacement, without skipping a character
        start = position + len(replacement)

    return source

java__cleanup_source = cpp__cleanup_source

def python__cleanup_source(s, options):
    """No cleanup is done for python: both arguments are ignored

    @return None
    """
    return None

# Map with the appropriate cleanup_source function based on the language
CLEANUP_SOURCE = {
    "java": java__cleanup_source,
    "cpp": cpp__cleanup_source,
    "python": python__cleanup_source,
}

# ------------------------------------------------------------------------------
# Create the LIST_IMPORTED_MODULES map
# ------------------------------------------------------------------------------

def java__list_imported_modules(filepath, source, options):
    """List the import declarations found in a Java or C/C++ source

    @param filepath: path to the file under analysis (not used)
    @param source: string contents of filepath
    @param options: options provided at command line
                    used: l

    @return one entry per declaration matched by the IMPORT pattern of the
            language options.l, in order of appearance; every entry is the
            tuple of the groups captured by that pattern
    """
    import_pattern = REGEXP[options.l]["IMPORT"]
    # finditer yields the successive non-overlapping matches, which is what
    # searching again from the end of the previous match amounts to
    return [found.groups() for found in import_pattern.finditer(source)]

cpp__list_imported_modules = java__list_imported_modules

class ImportVisitor:
    """Visitor collecting the names of the modules imported by a python file

    Instances are handed to compiler.walk; the result is available in the
    'modules' attribute
    """
    def __init__(self):
        """Start with an empty 'modules' set"""
        self.modules = set()

    def visitFrom(self, node, *args):
        """Record the module of a From node
        from Name import (*|subname [as alias] [, subname [as alias]] ...)
        """
        # the name of the imported module is read from node.modname
        imported_module = node.modname
        self.modules.add(imported_module)

    def visitImport(self, node, *args):
        """Record every module of an Import node
        import Name1 [as Alias1], Name2 [as Alias2]
        """
        # node.names is iterated as pairs (name, alias); only the name is kept
        for (name, alias) in node.names:
            self.modules.add(name)

def python__list_imported_modules(filepath, source, options):
    """List the modules imported by a Python file

    @param filepath: path to the file under analysis
    @param source: not used, the file is parsed from filepath
    @param options: not used

    @return the 'modules' set of the ImportVisitor that walked the file
    """
    # parse the file located at filepath
    tree = compiler.parseFile(filepath)
    # let an ImportVisitor walk the parsed file, then hand over what it found
    collector = ImportVisitor()
    walked = compiler.walk(tree, collector)
    return walked.modules

# Map with the appropriate list_imported_modules function based on the language
LIST_IMPORTED_MODULES = {
    "java": java__list_imported_modules,
    "cpp": cpp__list_imported_modules,
    "python": python__list_imported_modules,
}

# ------------------------------------------------------------------------------
# Create the GET_IMPORTED_PACKAGE map
# ------------------------------------------------------------------------------

def java__get_imported_package(imported, options=None, len_common_prefix=None, search_from=None, get_package_name=None):
    """Get the dotted name of the imported package in Java

    @param imported: pair (static, name) found in the source file, where
                     static is true for an 'import static' declaration and
                     name is the dotted name that is imported
    @param options: not used
    @param len_common_prefix: not used
    @param search_from: not used
    @param get_package_name: not used

    @return the name of the imported package (possibly empty)
    """
    # this file imports another package, class or static field
    static, imported_module = imported
    # use a simple heuristic to find the imported package name:
    # - the last part is the imported class (or '*'), and for a static import
    #   the last two parts are the class and its member: drop them
    # - of the remaining parts, keep the ones that do not start with an upper
    #   case letter, as classes conventionally do
    if static:
        dropped = 2
    else:
        dropped = 1
    parts = imported_module.split(".")[:-dropped]
    return ".".join(
        part for part in parts
        # empty parts come from malformed names such as 'a..B': ignoring them
        # avoids an IndexError on part[0]
        if part and part != "*" and part[0] == part[0].lower()
    )

def cpp__get_imported_package(imported, options, len_common_prefix, search_from, get_package_name):
    """Get the dotted name of the package included by a C/C++ file

    @param imported: groups captured for the '#include'; the first one is the
                     included name with its delimiters ("..." or <...>)
    @param options: handed over to get_package_name
    @param len_common_prefix: handed over to get_package_name
    @param search_from: directories in which the included file is looked for,
                        in that order
    @param get_package_name: function that returns the package name for a given
                             filename; called as
                             get_package_name(file, options, len_common_prefix, None)

    @return - the package name of the first existing file, or "$root" when
              that name is empty
            - when no file is found: the included name, keeping its
              delimiters for <...> and dropping them for "..."
    """
    include = imported[0]
    header = include[1:-1]
    # use a simple heuristic: the first directory that contains the header wins
    for directory in search_from:
        candidate = realpath(join(directory, header))
        if os.path.exists(candidate):
            return get_package_name(candidate, options, len_common_prefix, None) or "$root"
    # not found: this one is supposed to be standard for this project
    if include[0] == "<":
        return include
    return header

def python__get_imported_package(imported, options, len_common_prefix, search_from, get_package_name):
    """Get the dotted name of the package imported by a Python file

    @param imported: dotted name of the import found in source file
    @param options: handed over to get_package_name
    @param len_common_prefix: handed over to get_package_name
    @param search_from: directories in which the imported module is looked
                        for, in that order
    @param get_package_name: function that returns the package name for a given
                             filename; called as
                             get_package_name(file, options, len_common_prefix, None)

    @return - the package name of the first existing file, or "$root" when
              that name is empty
            - when no file is found: the imported name between < and >
    """
    relative_path = imported.replace(".", sep)
    # the import is either a module (a file that ends with .py) or
    # a python package (a directory that contains a file __init__.py)
    suffixes = (".py", "%s%s" % (sep, "__init__.py"))
    # use a simple heuristic: the first existing candidate wins
    for directory in search_from:
        for suffix in suffixes:
            candidate = join(directory, "%s%s" % (relative_path, suffix))
            if os.path.exists(candidate):
                return get_package_name(candidate, options, len_common_prefix, None) or "$root"
    # not found: this one is supposed to be standard for this project
    return "<%s>" % imported

# Map with the appropriate get_imported_package function based on the language
GET_IMPORTED_PACKAGE = {
    "java": java__get_imported_package,
    "cpp": cpp__get_imported_package,
    "python": python__get_imported_package,
}

# ==============================================================================
#                                                        PERFORM THE ACTUAL WORK
# ==============================================================================

def run(options):
    """Perform the actual work.

    1. Reads the paths of the source files from options.i (one path per
       line, blank lines are ignored)
    2. Creates a "package" for every file
    3. Stores the dependencies to other packages by recording the
       '#include' or 'import' found in the files
    4. Writes the dependencies in DOT format to options.dot
    5. Writes the metrics Ca, Ce, I and Idep in CSV format to options.csv

    @param options: options provided at command line
                    used: i, l, c, e, files, internal, all, dot, csv
                    (it is also handed over to the language specific helpers)
    """

    # predicate used to specify if a given package should be included (-c)
    if options.c:
        include = re.compile(options.c).search
    else:
        include = lambda name: True
    # predicate used to specify if a given package should be excluded (-e)
    if options.e:
        exclude = re.compile(options.e).search
    else:
        exclude = lambda name: False

    def weight(occurrence):
        """Contribution of a dependency seen <occurrence> times to a metric"""
        if options.all:
            return occurrence
        return 1

    def reported(used_packages):
        """Restrict used_packages to the project packages if --internal is set"""
        if not options.internal:
            return used_packages
        return dict(
            (used, occurrence)
            for (used, occurrence) in used_packages.items()
            if used in packages
        )

    # get all the filenames to analyse, because we need to extract the
    # common prefix of all source files; a blank line is not a file
    stripped_lines = (line.strip() for line in options.i)
    filenames = [name for name in stripped_lines if name]
    common_prefix = os.path.commonprefix(filenames)
    # commonprefix compares character by character, so that the prefix can
    # end in the middle of a name ('/src/pack' for '/src/pack1/a.py' and
    # '/src/pack2/b.py'): cut it back to the last separator, otherwise the
    # package names are truncated and the prefix is not a directory
    common_prefix = common_prefix[:common_prefix.rfind(sep) + 1]
    prefix_length = len(common_prefix)
    include_directory = os.path.join(common_prefix, "include")

    language = options.l
    get_package_name = GET_PACKAGE_NAME[language]
    cleanup_source = CLEANUP_SOURCE[language]
    list_imported_modules = LIST_IMPORTED_MODULES[language]
    get_imported_package = GET_IMPORTED_PACKAGE[language]

    # imports is used to record all packages importing other packages:
    # package -> {used package -> number of times it is used}
    imports = {}
    # packages is used to record all packages in the project
    packages = {}

    #
    # Get all imports and all packages from all files in the project
    # ==============================================================
    #

    for filepath in filenames:

        source_file = open(filepath)
        try:
            source = source_file.read()
        finally:
            # do not keep one open file per analysed source
            source_file.close()

        source = cleanup_source(source, options)

        # Get the package this file contributes to
        directory = dirname(filepath)
        package_name = get_package_name(filepath, options, prefix_length, source)
        package = '"%s"' % (package_name or "__Default__")
        if options.files:
            containing_package = '"%s"' % ".".join(package_name.split(".")[:-1])
            packages[containing_package] = True
        # the package itself always belongs to the project; with --files the
        # used packages of cpp and python are file based names too, and
        # --internal would otherwise drop them all
        packages[package] = True

        # the imported files are searched in the directory of the analysed
        # file, then in the root of the project and in its 'include' directory
        search_from = (directory, common_prefix, include_directory)

        for imported in list_imported_modules(filepath, source, options):
            imported_package = get_imported_package(imported, options, prefix_length, search_from, get_package_name)
            # we have here the package that is used
            used_package = '"%s"' % imported_package

            # Filter the imported packages to keep the ones specified
            # at the command line
            if (used_package == package) or exclude(imported_package) or not include(imported_package):
                continue

            # Insert this used_package in the packages used by the currently
            # defined package and increment its usage count
            used_packages = imports.setdefault(package, {})
            used_packages[used_package] = used_packages.get(used_package, 0) + 1

    #
    # Report graph
    # ============
    #
    write_dot = options.dot.write
    write_dot("digraph map {\n")
    for (package, used_packages) in imports.items():
        # if we only want internal packages (--internal is set)
        # the dependencies to external packages are not shown
        for used_package in reported(used_packages):
            write_dot("%s -> %s;\n" % (package, used_package))
    write_dot("}\n")

    #
    # Compute couplings
    # =================
    #
    afferent = {}
    efferent = {}
    for (package, used_packages) in imports.items():
        used_packages = reported(used_packages)
        ce = 0
        for (used_package, occurrence) in used_packages.items():
            increment = weight(occurrence)
            afferent[used_package] = afferent.get(used_package, 0) + increment
            ce += increment
        efferent[package] = ce

    metrics = {}
    for package in chain(afferent, efferent):
        ca = afferent.get(package, 0)
        ce = efferent.get(package, 0)
        if ca + ce:
            instability = float(ce) / (ca + ce)
        else:
            # a package whose dependencies were all filtered out
            instability = 0
        metrics[package] = (ca, ce, instability, None)

    for package in list(metrics):
        ca, ce, instability, _ = metrics[package]
        idep = 0
        for (used_package, occurrence) in reported(imports.get(package, {})).items():
            # a dependency to a less stable package is illegal
            if instability < metrics[used_package][2]:
                idep += weight(occurrence)
        metrics[package] = (ca, ce, instability, idep)

    #
    # Report couplings
    # ================
    #
    writer = csv.writer(options.csv, dialect="excel")
    writer.writerow(["Package", "Afferent", "Efferent", "Instability", "IllegalDependencies"])
    writer.writerows(
        # package[1:-1] drops the double quotes added for the DOT output
        (package[1:-1], ca, ce, instability, idep)
        for (package, (ca, ce, instability, idep)) in metrics.items()
    )

# ==============================================================================
#                  ANALYSE THE COMMAND LINE ARGUMENTS AND LAUNCH THE COMPUTATION
# ==============================================================================

if __name__ == "__main__":
    import getopt

    class options:
        """Basic options class used to collect options from command line
        """
        l = None          # language of the source code files
        f = None          # path to file containing the list of filenames
        e = None          # regexp to exclude imported package from count
        c = None          # regexp to include imported package into count
        all = False       # count all dependencies for couplings
        internal = False  # list project only packages
        files = False     # packages are files instead of directories
        fast = False      # try to speed up the computation with psyco
        cleanup = False   # remove comments and strings before pattern matching
        dot = None        # path of the DOT report, then the file to write to
        csv = None        # path of the CSV report, then the file to write to

        i = sys.stdin     # list of files to analyse (replaced when -f is set)

    # long options that take no argument and simply switch an option on
    switches = {
        "--fast": "fast",
        "--files": "files",
        "--internal": "internal",
        "--all": "all",
        "--cleanup": "cleanup",
    }

    try:
        # parse the options provided at command line
        opts, args = getopt.getopt(
            sys.argv[1:],
            "f:c:e:l:h",
            ["help", "fast", "files", "all", "internal", "cleanup", "dot=", "csv="]
        )
    except getopt.GetoptError:
        # if any error, print usage with error text and exit
        usage("Error: %s" % sys.exc_info()[1])

    # transfer every option of the command line to the options class
    for (opt, arg) in opts:
        if opt in ("-h", "--help"):
            # if help is required, show usage and exit
            usage()
        elif opt in switches:
            setattr(options, switches[opt], True)
        else:
            # otherwise the option is named after its switch without dashes
            if opt.startswith("--"):
                name = opt[2:]
            else:
                name = opt[1:]
            setattr(options, name, arg)

    # check that all mandatory options are present
    if not options.l:
        usage("Error: Missing mandatory option -l")
    # check provided programming language
    if options.l not in ("java", "cpp", "python"):
        usage("Error: Unknown language '%s'. Should be one of (java, cpp, python)" % options.l)

    if options.f:
        # check that provided path exists
        if not os.path.exists(options.f):
            usage("Error: Path not found '%s'" % options.f)
        # check that provided input file is not a directory
        if os.path.isdir(options.f):
            usage("Error: Path '%s' is a directory" % options.f)
        options.i = open(options.f)

    # both reports go to the given file, or to stdout when none is given
    for report in ("dot", "csv"):
        target = getattr(options, report)
        if not target:
            setattr(options, report, sys.stdout)
            continue
        # check that provided report file is not a directory
        if os.path.isdir(target):
            usage("Error: Path '%s' is a directory" % target)
        setattr(options, report, open(target, "w"))

    if options.fast:
        try:
            import psyco
            psyco.full()
        except ImportError:
            sys.stderr.write("Warning: Psyco not found, running at normal speed\n")

    # everything is checked, do the actual job
    run(options)
