# hstats.py -- analyse profiling data from hotshot

# Copyright (c) 2005 Floris Bruynooghe

# All rights reserved.

# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, provided
# that the above copyright notice(s) and this permission notice appear
# in all copies of the Software and that both the above copyright
# notice(s) and this permission notice appear in supporting
# documentation.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR
# ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.

# Except as contained in this notice, the name of a copyright holder
# shall not be used in advertising or otherwise to promote the sale,
# use or other dealings in this Software without prior written
# authorization of the copyright holder.

"""hstats - examine and print reports on code profiled by hotshot."""


import exceptions
import hotshot.log as log
import sys
import os.path


class HstatsError(Exception):
    """Base class for exceptions of the hstats module.

    Derives from the builtin Exception, which is the same class the
    'exceptions' module exposes as exceptions.Exception.
    """


class CorruptFileError(HstatsError):
    """Raised when a profile file is corrupt or holds invalid data."""


# Positions of the per-frame fields inside each Stats._data[name] list:
# the call counters, the own time, the cumulative time and the parents.
(_SDATA_CALL,
 _SDATA_TIME,
 _SDATA_CUMTIME,
 _SDATA_PARENTS) = range(4)


class Stats:
    """Class to examine profiling reports from hotshot.

    The data this class can extract out of a profile is the
    following:

    call - The number of times a frame was called.  The first number
      is the number of calls, the second the number of (directly)
      recursive calls.

    time - Total time spent in a frame.

    avgtime - Average time spent in a frame (=time/ncalls)

    cumtime - Time spent in a frame including time spent in frames
      called from this frame (cumulative time).

    avgcumtime - Average cumulative time (=cumtime/ncalls)

    name - filename, line number and name of a frame.

    Note: a frame for the profiler can be a function or a method.

    Note: line events are not supported currently.  Reading a profile
      with line events will thus raise a NotImplementedError.
    """

    def __init__(self, filename, keep_exec=False):
        """Create Stats object

        filename - Name of file containing the hotshot profiling data.

        keep_exec - When profiling using a string (like
          Profiler.run()) this always gets stored as a function call
          named <string>.  The default behaviour is to delete this
          fake root call.  However the old profiler module did have
          it, so this is an obsolete backwards compatibility option.
          If you really intend to use this check out the timeit module
          instead.

        Raises CorruptFileError when an EXIT event names a frame that
        was never entered, NotImplementedError when the log contains
        line events and EOFError when no profiling data is left after
        reading the file.
        """
        # Frame -> timing info
        # {name: [[ncall, rcall], time, cumtime, {parent_name: ncalls}], ...}
        self._data = {}

        self._reader = log.LogReader(filename)
        self.WHAT = 0
        self.NAME = 1
        self.TDELTA = 2

        # The reader maintains the stack of active frames itself; it is
        # shared here so the parent of the current frame can be looked up.
        self._framestack = self._reader._stack
        self._cumstack = []

        # Read the file in into self._data.  On ENTER the tdelta gets
        # added to the TIME of the parent *and* to the cumstack of the
        # parent.  On EXIT a frame adds its cumstack to its CUMTIME.
        for event in self._reader:
            name = event[self.NAME]
            what = event[self.WHAT]
            tdelta = event[self.TDELTA]

            if what == log.ENTER:
                self._cumstack.append(0)
                if name in self._data:
                    frame = self._data[name]
                    frame[_SDATA_CALL][0] += 1
                else:
                    frame = [[1, 0], 0, 0, {}]
                    self._data[name] = frame
                # Only frames below the root have a parent to account
                # to.  A root frame can be entered more than once, so
                # this must be checked for known frames as well.
                if len(self._framestack) > 1:
                    parent_name = self._framestack[-2]
                    self._data[parent_name][_SDATA_TIME] += tdelta
                    self._cumstack[-2] += tdelta
                    parents = frame[_SDATA_PARENTS]
                    parents[parent_name] = parents.get(parent_name, 0) + 1
                    if name == parent_name:
                        # Recursive call
                        frame[_SDATA_CALL][1] += 1

            elif what == log.EXIT:
                if name not in self._data:
                    raise CorruptFileError("EXIT frame without earlier ENTER.")
                frame = self._data[name]
                frame[_SDATA_TIME] += tdelta
                self._cumstack[-1] += tdelta
                elapsed = self._cumstack.pop()
                # The frame stack no longer holds the exiting frame at
                # this point, so a non-empty stack means a parent exists.
                if len(self._framestack) > 0:
                    self._cumstack[-1] += elapsed
                frame[_SDATA_CUMTIME] += elapsed

            elif what == log.LINE:
                raise NotImplementedError("lineevents currently unsupported.")

        if not keep_exec:
            self._data.pop(('<string>', 1, '?'), None)
        if len(self._data) == 0:
            raise EOFError("no profiling data found in %s" % filename)

    def show(self, sort=('time', 'name'), weed=('dirs',), limit=20):
        """Print the statistics on the screen.

        sort - Order by which sorting is done.  This is a sequence of
          one or more of the items listed in the class docstring.
          Sorting is first done on the first entry, if multiple
          entries have the same value sorting within this group is
          done by the second criterion etc.  All numbers are sorted
          descending and the name is sorted ascending.  If this is
          'None' the data is unsorted.  For more detailed sorting see
          .sort_data().

        weed - What you don't want to see.  See the docstring of
          .get_data() for this parameter.

        limit - The number of entries shown.  Set this to 0 to see
          everything.

        Raises HstatsError for an unknown or repeated sort keyword and
        for a sort keyword whose column has been weeded.
        """
        valid_sortables = ['call', 'time', 'avgtime',
                           'cumtime', 'avgcumtime', 'name']
        if sort is None:
            sort = []
        else:
            sort = list(sort)
        for thing in sort:
            if thing not in valid_sortables:
                raise HstatsError("%s is an invalid sort option." % (thing,))
            if sort.count(thing) > 1:
                raise HstatsError("multiple occurences of same sort keyword.")

        data, dd = self.get_data(weed=weed)
        order = []
        for thing in sort:
            if thing not in dd:
                raise HstatsError("cannot sort on %s: column is not "
                                  "in the data." % (thing,))
            if thing == 'name':
                direction = 'a'
            else:
                direction = 'd'
            order.append((dd.index(thing), direction))
        data = self.sort_data(data, order=order)
        self.print_data(data, dd, limit=limit)

    def get_data(self, weed=None, extra=None):
        """Retrieve the data in a sortable form.

        weed - What data not to return.  Can be a list or tuple of any
          combination of the following: 'dirs', 'name', 'call',
          'time', 'avgtime', 'cumtime', 'avgcumtime'.  The number of
          calls and the number of recursive calls share the 'call'
          column and can thus only be weeded together.  'dirs' keeps
          the name column but strips the directory from the filename;
          when both 'dirs' and 'name' are given, 'name' is ignored.
          The argument itself is never modified.

        extra - Extra data that has to be included too but is not by
          default.  This is a list of the items requested extra.
          Currently the only known value is 'parents', this will add
          the mapping of parent names to call counts to each data row.

        A tuple of the data and the data description is returned.  The
        data description is a list of strings specified in the
        docstring of this class.  The order of this list is the order
        in which each row in the data is sorted.

        Raises ValueError when weed or extra is not a list or tuple
        and HstatsError for unknown keywords or a repeated extra
        keyword.
        """
        valid_weeding = ['dirs', 'name', 'call', 'time',
                         'avgtime', 'cumtime', 'avgcumtime']
        valid_extras = ['parents']
        if weed is None:
            weed = []
        if not isinstance(weed, (list, tuple)):
            raise ValueError("weed argument is not a list: %s" % (weed,))
        # Work on a private, de-duplicated copy so the caller's
        # sequence is left untouched.
        weeded = []
        for thing in weed:
            if thing not in valid_weeding:
                raise HstatsError("%s is not a valid weeding keyword."
                                  % (thing,))
            if thing not in weeded:
                weeded.append(thing)
        if 'name' in weeded and 'dirs' in weeded:
            weeded.remove('name')
        if extra is None:
            extra = []
        if not isinstance(extra, (list, tuple)):
            raise ValueError("extra argument is not a list: %s" % (extra,))
        for item in extra:
            if item not in valid_extras:
                raise HstatsError("%s is not a valid extra keyword."
                                  % (item,))
            if extra.count(item) > 1:
                raise HstatsError("%s keyword can only appear once."
                                  % (item,))

        strip_dirs = 'dirs' in weeded
        with_name = strip_dirs or 'name' not in weeded
        with_parents = 'parents' in extra

        # The data description only depends on the arguments.
        dd = []
        for column in ('call', 'time', 'avgtime', 'cumtime', 'avgcumtime'):
            if column not in weeded:
                dd.append(column)
        if with_name:
            dd.append('name')
        if with_parents:
            dd.append('parents')

        data = []
        for name, stats in self._data.items():
            ncall = stats[_SDATA_CALL][0]
            row = []
            if 'call' in dd:
                row.append(stats[_SDATA_CALL])
            if 'time' in dd:
                row.append(stats[_SDATA_TIME])
            if 'avgtime' in dd:
                row.append(stats[_SDATA_TIME] / ncall)
            if 'cumtime' in dd:
                row.append(stats[_SDATA_CUMTIME])
            if 'avgcumtime' in dd:
                row.append(stats[_SDATA_CUMTIME] / ncall)
            if with_name:
                if strip_dirs:
                    row.append((os.path.basename(name[0]), name[1], name[2]))
                else:
                    row.append(name)
            if with_parents:
                row.append(stats[_SDATA_PARENTS])
            data.append(row)
        return data, dd

    def sort_data(self, data, order):
        """Sort the data according to criteria.

        order - On which fields in data to sort on.  This is a list of
          pairs (tuples).  The first number of a pair is the column to
          sort on, the second is either an 'a' or a 'd' to specify to
          sort ascending or descending.

        The list is sorted in place and returned.  Rows that are equal
        on every criterion keep their relative order.

        Raises HstatsError when a direction is neither 'a' nor 'd'.
        """
        criteria = [(pair[0], pair[1]) for pair in order]
        for index, direction in criteria:
            if direction != 'a' and direction != 'd':
                raise HstatsError("got '%s' for order dirction, "
                                  "expect 'a' or 'd'." % direction)
        # list.sort() is stable, also with reverse=True, so sorting on
        # the least significant criterion first and the most
        # significant one last yields the combined ordering.
        criteria.reverse()
        for index, direction in criteria:
            data.sort(key=lambda row, i=index: row[i],
                      reverse=(direction == 'd'))
        return data

    def print_data(self, data, dd,
                   headers=True,
                   order=None,
                   limit=20,
                   fd=None):
        """Print out the data to stdout.

        data - Data to print, received from .get_data()

        dd - Data Description as returned from .get_data()

        order - Order in which columns are printed.  This is a list
          containing the same strings as 'dd' but ordered correctly.
          If 'None' the default order 'call', 'time', 'avgtime',
          'cumtime', 'avgcumtime', 'name' is used, restricted to the
          columns that are present in 'dd'.

        headers - When True a line with column headers is printed
          first.  NOTE: any other value, including a list of header
          names, currently suppresses the header line.

        limit - The number of entries or rows to show.  Set to '0' for
          all.

        fd - File object that will be written to.  By default output
          is written to whatever sys.stdout is at the time of the
          call.

        Alignment is not configurable.  Numbers are right aligned,
        text is left aligned.  The number of calls and the number of
        recursive calls are always printed together as 'calls/rcalls'.

        Raises HstatsError when 'order' names a column missing in 'dd'.
        """
        if fd is None:
            # Looked up at call time so a redirected stdout is honoured.
            fd = sys.stdout
        # Limit data to work on.
        if limit != 0:
            data = data[:limit]
        # Checks.
        if order is None:
            default_order = ['call', 'time', 'avgtime',
                             'cumtime', 'avgcumtime', 'name']
            order = [column for column in default_order if column in dd]
        for thing in order:
            if thing not in dd:
                raise HstatsError("asked to print %s which is not "
                                  "available in the data given." % thing)
        # Format every cell up front; the column widths follow from
        # the text that is really going to be printed.
        indices = [dd.index(column) for column in order]
        table = []
        for row in data:
            table.append([self._format_cell(column, row[index])
                          for column, index in zip(order, indices)])
        widths = []
        for position, column in enumerate(order):
            width = 6
            for cells in table:
                needed = len(cells[position])
                if column != 'name':
                    # Right aligned columns get one separating space.
                    needed += 1
                if needed > width:
                    width = needed
            widths.append(width)
        # Print headers.
        if headers == True:
            header_map = {'call': 'calls',
                          'time': 'tott',
                          'avgtime': 'avgt',
                          'cumtime': 'cumt',
                          'avgcumtime': 'avgct',
                          'name': 'name'}
            titles = []
            for column in order:
                title = header_map.get(column, column)
                if column == 'name':
                    title = '  ' + title
                titles.append(title)
            self._write_line(fd, order, widths, titles)
        # Print out the results.
        for cells in table:
            self._write_line(fd, order, widths, cells)

    def _format_cell(self, column, value):
        """Return the text printed for one value of the given column."""
        if column == 'name':
            return '  %s:%s:%s' % (value[0], value[1], value[2])
        if column == 'call':
            text = str(value[0])
            if value[1] > 0:
                text += '/' + str(value[1])
            return text
        return str(value)

    def _write_line(self, fd, order, widths, cells):
        """Write one aligned line of cells, followed by a newline."""
        parts = []
        for column, width, text in zip(order, widths, cells):
            if column == 'name':
                parts.append(text.ljust(width))
            else:
                parts.append(text.rjust(width))
        fd.write(''.join(parts) + '\n')

    def get_info(self):
        """Return the dictionary of info added in the profiler file."""
        return self._reader._info
