acgc.stats.bivariate

Bivariate statistics

Statistical measures of relationships between two populations

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3""" Bivariate statistics
  4
  5Statistical measures of relationships between two populations
  6"""
  7
  8import numpy as np
  9from scipy import stats
 10from .bivariate_lines import sen, sma, bivariate_line_equation
 11# import xarray as xr
 12
 13__all__ = [
 14    "BivariateStatistics",
 15    "nmb",
 16    "nmae",
 17    "nmbf",
 18    "nmaef"
 19]
 20
 21def nmb( x0, x1 ):
 22    '''Compute Normalized Mean Bias (NMB)
 23
 24    NMB = ( mean(x1) - mean(x0) ) / mean(x0)
 25
 26    Parameters
 27    ----------
 28    x0 : array_like
 29        reference values
 30    x1 : array_like
 31        experiment values
 32    '''
 33
 34    assert (len(x0) == len(x1)), \
 35        "Parameters x0 and x1 must have the same length"
 36
 37    # Mean values
 38    x0_mean = np.mean(x0)
 39    x1_mean = np.mean(x1)
 40
 41    # Metric value
 42    return x1_mean / x0_mean - 1
 43
 44def nmae( x0, x1 ):
 45    '''Compute Normalized Mean Absolute Error (NMAE)
 46
 47    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
 48
 49    Parameters
 50    ---------
 51    x0 : array_like
 52        reference values
 53    x1 : array_like
 54        experiment values
 55    '''
 56
 57     # Mean values
 58    x0_mean = np.mean(x0)
 59
 60    # Mean absolute difference
 61    abs_diff = np.mean( np.abs(x1 - x0) )
 62
 63    # Metric value
 64    return abs_diff / np.abs( x0_mean )
 65
 66
 67def nmbf( x0, x1 ):
 68    '''Compute Normalized Mean Bias Factor (NMBF)
 69
 70    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
 71
 72    Parameters
 73    ----------
 74    x0 : array_like
 75        reference values
 76    x1 : array_like
 77        experiment values
 78    '''
 79
 80    # Ensure that arguments have the same length
 81    assert (len(x0) == len(x1)), \
 82        "Parameters x0 and x1 must have the same length"
 83
 84    # Mean values
 85    x0_mean = np.mean(x0)
 86    x1_mean = np.mean(x1)
 87
 88    # Metric value
 89    if x1_mean >= x0_mean:
 90        result = x1_mean / x0_mean - 1
 91    else:
 92        result= 1 - x0_mean / x1_mean
 93    # Equivalent (faster?) implementation
 94    #S = (mMean - oMean) / np.abs(mMean - oMean)
 95    #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 )
 96
 97    return result
 98
 99def nmaef( x0, x1 ):
100    '''Compute Normalized Mean Absolute Error Factor (NMAEF)
101
102    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
103    
104    Parameters
105    ----------
106    x0 : array_like
107        reference values
108    x1 : array_like
109        experiment values
110    '''
111
112    # Ensure that arguments have the same length
113    assert (len(x0) == len(x1)), \
114        "Parameters x0 and x1 must have the same length"
115
116    # Mean values
117    x0_mean = np.mean(x0)
118    x1_mean = np.mean(x1)
119
120    # Mean absolute difference
121    abs_diff = np.mean( np.abs(x1 - x0))
122
123    # Metric value
124    if x1_mean >= x0_mean:
125        result = abs_diff / x0_mean 
126    else:
127        result = abs_diff / x1_mean
128    # Equivalent (faster?) implementation
129    #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean)
130    #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) )
131
132    return result
133
134def _texify_name(name):
135    '''Return a LaTex formatted string for some variables
136    
137    Parameters
138    ----------
139    name : str
140    
141    Returns
142    -------
143    pretty_name : str
144    '''
145    if name.lower()=='n':
146        pretty_name = r'$n$'
147    elif name=='R2':
148        pretty_name = f'$R^2$'
149    elif name=='r2':
150        pretty_name = f'$r^2$'
151    elif name.lower()=='y_ols':
152        pretty_name = r'$y_{\rm OLS}$'
153    elif name.lower()=='y_sma':
154        pretty_name = r'$y_{\rm SMA}$'
155    elif name.lower()=='y_sen':
156        pretty_name = r'$y_{\rm Sen}$'
157    else:
158        pretty_name = name
159    return pretty_name
160
161def _number2str(value,
162                intformat='{:d}',
163                floatformat='{:.4f}'):
164    '''Format number as string using integer and float format specifiers
165    
166    Parameters
167    ----------
168    value : numeric, str
169        value to be converted
170    intformat : str, default='{:d}'
171        format specifier for integer types
172    floatformat : str, default='{:.4f}'
173        format specifier for float types
174
175    Returns
176    -------
177    str
178    '''
179    if isinstance(value,str):
180        pass
181    elif isinstance(value,(int,np.integer)):
182        value = intformat.format(value)
183    else:
184        value = floatformat.format(value)
185    return value
186
187class BivariateStatistics:
188    '''A suite of common statistics to quantify bivariate relationships
189
190    Class method 'summary' provides a formatted summary of these statistics
191    
192    Attributes
193    ----------
194    count, n : int
195        number of valid (not NaN) data value pairs
196    xmean, ymean : float
197        mean of x and y variables
198    xmedian, ymedian :float
199        median of x and y variables
200    xstd, ystd : float
201        standard deviation of x and y variables
202    mean_difference, md : float
203        ymean - xmean
204    std_difference, stdd : float
205        std( y - x )
206    mean_absolute_difference, mad : float
207        mean( |y-x| )
208    relative_mean_difference, rmd : float
209        md / xmean
210    relative_mean_absolute_difference, rmad :float
211        mad / xmean
212    standardized_mean_difference, smd : float
213        md / xstd
214    standardized_mean_absolute_difference, smad : float
215        mad /xstd
216    mean_relative_difference, mrd : float
217        mean(y/x) - 1
218    mean_absolute_relative_difference, mrd : float
219        mean(abs(y/x - 1))
220    mean_log10_ratio, mlr : float
221        mean( log10(y/x) )
222    std_log10_ratio, stdlr : float
223        std( log10(y/x) )
224    mean_absolute_log10_ratio, malr : float
225        mean( abs( log10(y/x) ) )
226    median_difference, medd : float
227        median(y-x)
228    median_absolute_difference, medad : float
229        median(|y-x|)
230    relative_median_difference, rmedd : float
231        median(y-x) / xmedian
232    relative_median_absolute_difference, rmedad : float
233        median(|y-x|) / xmedian
234    median_relative_difference, medianrd, medrd : float
235        median(y/x)-1
236    median_log10_ratio, medlr : float
237        median( log10(y/x) )
238    median_absolute_log10_ratio, medalr : float
239        median( abs( log10(y/x) ) )
240    normalized_mean_bias_factor, nmbf : float
241        see `nmbf` 
242    normalized_mean_absolute_error_factor, nmaef : float
243        see `nmaef`
244    root_mean_square_difference, rmsd : float
245        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
246    root_mean_square_log10_ratio, rmslr : float
247        $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$
248    covariance : float
249        cov(x,y)
250    correlation_pearson, correlation, pearsonr, R, r : float
251        Pearson linear correlation coefficient 
252    correlation_spearman, spearmanr : float
253        Spearman, non-parametric rank correlation coefficient
254    R2, r2 : float
255        Linear coefficient of determination, $R^2$
256    '''
257
258    def __init__(self,x,y,w=None,dropna=False,data=None):
259        '''Compute suite of bivariate statistics during initialization
260        
261        Statistic values are saved in attributes.
262        CAUTION: Weights w are ignored except in SMA fit
263
264        Parameters
265        ----------
266        x : ndarray or str
267            independent variable values
268        y : ndarray or str
269            dependent variable values, same size as x
270        w : ndarray or str, optional
271            weights for points (x,y), same size as x and y
272        dropna : bool, optional (default=False)
273            drops NaN values from x, y, and w
274        data : dict-like, optional
275            if x, y, or w are str, then they should be keys in data
276        '''
277
278        # Get values from data if needed
279        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
280            raise ValueError( 'Data argument must be used if x, y, or w is a string')
281        if isinstance(x,str):
282            x = data[x]
283        if isinstance(y,str):
284            y = data[y]
285        if isinstance(w,str):
286            w = data[w]
287
288        #Ensure that x and y have same length
289        if len(x) != len(y):
290            raise ValueError( 'Arguments x and y must have the same length' )
291        if w is None:
292            w = np.ones_like(x)
293        if len(w) != len(x):
294            raise ValueError( 'Argument w (if present) must have the same length as x' )
295
296        # Drop NaN values
297        if dropna:
298            isna = np.isnan(x*y*w)
299            x = x[~isna]
300            y = y[~isna]
301            w = w[~isna]
302
303        # Differences and ratios used repeatedly
304        diff = y - x
305        absdiff = np.abs( y - x )
306        # Ignore divide by zero and 0/0 while dividing
307        old_settings = np.seterr(divide='ignore',invalid='ignore')
308        ratio = y/x
309        log10ratio = np.log10(ratio)
310        np.seterr(**old_settings)
311
312        # Number of data points
313        self.count = self.n = len(x)
314
315        # Means, medians, and standard deviations
316        self.xmean = np.mean(x)
317        self.ymean = np.mean(y)
318        self.xmedian = np.median(x)
319        self.ymedian = np.median(y)
320        self.xstd   = np.std(x)
321        self.ystd   = np.std(y)
322
323        # Save values for use later
324        self._x = x
325        self._y = y
326        self._w = w
327
328        # Mean and mean absolute differences
329        self.mean_difference            = self.md   = self.ymean - self.xmean
330        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
331        self.std_difference             = self.stdd = np.std( diff )
332
333        # Relative and standardized differences
334        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
335        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
336        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
337        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
338
339        # Mean and median relative differences
340        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
341        self.mean_absolute_relative_difference   = self.mard  = np.mean( np.abs( ratio - 1 ) )
342        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
343        self.median_absolute_relative_difference = self.medianard = self.medard = np.median( np.abs( ratio - 1 ) )
344
345        # Median and median absolute differences
346        self.median_difference          = self.medd  = np.median( diff )
347        self.median_absolute_difference = self.medad = np.median( absdiff )
348
349        # Relative median differences
350        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
351        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
352
353        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
354        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
355
356        # Mean and mean absolute log ratio
357        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
358        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
359        self.std_log10_ratio           = self.stdlr= np.std( log10ratio )
360
361        # Median and median absolute log ratio
362        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
363        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
364
365        # RMS difference
366        self.root_mean_square_difference = self.rmsd   = np.sqrt( np.mean( np.power( diff, 2) ) )
367        # RMS log ratio
368        self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 )))
369
370        # Covariance, correlation
371        self.covariance = self.cov = np.cov(x,y)[0][1]
372        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
373            np.corrcoef(x,y)[0][1]
374        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
375        self.R2 = self.r2 = self.R**2
376
377    def __getitem__(self,key):
378        '''Accesses attribute values via object['key']'''
379        return getattr(self,key)
380
381    def fitline(self,method='sma',intercept=True,**kwargs):
382        '''Compute bivariate line fit
383        
384        Parameters
385        ----------
386        method : str
387            line fitting method: sma (default), ols, wls, York, sen, siegel
388        intercept : bool
389            defines whether non-zero intercept should be fitted
390        **kwargs 
391            passed to `acgc.stats.sma` (e.g. robust=True)
392
393        Returns
394        -------
395        result : dict
396            dictionary with keys:
397            - slope (float)
398                slope of fitted line
399            - intercept (float)
400                intercept of fitted line
401            - fittedvalues (array (N,))
402                values on fit line
403            - residuals (array (N,))
404                residual from fit line
405        '''
406
407        fitintercept = intercept
408
409        if method.lower()=='sma':
410            fit = sma(  self._x,
411                        self._y,
412                        self._w,
413                        intercept=fitintercept,
414                        **kwargs)
415            slope = fit['slope']
416            intercept= fit['intercept']
417
418        elif method.lower()=='ols':
419            if fitintercept:
420                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
421                                      self._y, rcond=None )
422            else:
423                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
424            slope = ols[0][0]
425            intercept = ols[0][1]
426
427        elif method.lower() in ['theil','sen','theilsen']:
428            fitintercept = True
429            fit = sen( self._x,
430                       self._y,
431                       **kwargs)
432            slope = fit['slope']
433            intercept = fit['intercept']
434
435        elif method.lower()=='siegel':
436            fitintercept = True
437            siegel = stats.siegelslopes( self._x,
438                                         self._y )
439            slope = siegel.slope
440            intercept = siegel.intercept
441
442        elif method.lower()=='wls':
443            raise NotImplementedError('WLS regression not implemented yet')
444
445        elif method.lower()=='york':
446            raise NotImplementedError('York regression not implemented yet')
447
448        else:
449            raise ValueError('Undefined method '+method)
450
451        line = dict( slope          = slope,
452                     intercept      = intercept,
453                     fittedvalues   = slope * self._x + intercept,
454                     residuals      = self._y - ( slope * self._x + intercept ),
455                     method         = method,
456                     fitintercept   = fitintercept )
457
458        return line
459
460    def slope(self,method='sma',intercept=True,**kwargs):
461        '''Compute slope of bivariate line fit
462        
463        Parameters
464        ----------
465        method : str
466            line fitting method: sma (default), ols, wls
467        intercept : bool
468            defines whether non-zero intercept should be fitted
469        **kwargs 
470            passed to `fitline`
471
472        Returns
473        -------
474        slope : float
475            value of y intercept
476        '''
477        return self.fitline(method,intercept,**kwargs)['slope']
478
479    def intercept(self,method='sma',intercept=True,**kwargs):
480        '''Compute intercept of bivariate line fit
481        
482        Parameters
483        ----------
484        method : str
485            line fitting method: sma (default) or ols
486        intercept : bool
487            defines whether non-zero intercept should be fitted
488        **kwargs 
489            passed to `fitline`
490
491        Returns
492        -------
493        intercept : float
494            value of y intercept
495        '''
496        return self.fitline(method,intercept,**kwargs)['intercept']
497
498    def _expand_variables(self,variables):
499        '''Expand special strings into a list of variables
500        
501        Parameter
502        ---------
503        variables : list or str, default='common'
504            Special strings ("all","common") will be expanded to a list of variables
505            list arguments will not be modified
506
507        Returns
508        -------
509        list 
510            variable names
511        '''
512        if variables is None:
513            variables='common'
514        if variables=='all':
515            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
516                       'MLR','MALR',
517                       'MedD','MedAD','RMedD','RMedAD','MedRD',
518                       'MedLR','MedALR',
519                       'NMBF','NMAEF','RMSD','cov',
520                       'R','R2','spearmanr','slope','intercept',
521                       'fitline','n']
522        elif variables=='common':
523            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n']
524        if not isinstance(variables,list):
525            raise ValueError(
526                'variables must be a list, None, or one of these strings: "all","common"')
527
528        return variables
529
530    def _summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
531        '''Summarize bivariate statistics into a dict
532
533        Parameters
534        ----------
535        vars : list or str, default='common'
536            names of attribute variables to include in summary
537            names are case insensitive            
538            The following strings are also accepted in place of a list 
539                "all" (displays all variables)
540                "common" (displays all measures of mean difference)
541        fitline_kw : dict, default=None
542            keywords passed to `fitline`
543        floatformat_fiteqn : str, default=floatformat
544            format specifier for slope and intercept (a,b) in y = a x + b
545        
546        Returns
547        -------
548        summary : dict
549            names and values of variables
550        '''
551
552        # List of variables
553        variables = self._expand_variables(variables)
554
555        if fitline_kw is None:
556            fitline_kw = {'method':'sma',
557                          'intercept':True}
558
559        # Construct the dict
560        summary = {}
561        for v in variables:
562            if v in ['slope','intercept']:
563                # These variables are object methods
564                func = getattr(self,v)
565                value = func(**fitline_kw)
566            elif v == 'fitline':
567                line = self.fitline(**fitline_kw)
568                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
569            else:
570                # Retrieve values
571                value = getattr(self,v.lower())
572
573            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
574            summary[v] = value
575
576        return summary
577
578    def summary(self, variables=None, fitline_kw=None,
579                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
580                stringlength=None ):
581        '''Summarize bivariate statistics
582
583        Parameters
584        ----------
585        vars : list or str, default='common'
586            names of attribute variables to include in summary
587            names are case insensitive            
588            The following strings are also accepted in place of a list 
589                "all" (displays all variables)
590                "common" (displays all measures of mean difference)
591        fitline_kw : dict, default=None
592            keywords passed to `fitline`
593        intformat : str, default='{:d}'
594            format specifier for integer values
595        floatformat : str, default='{:.4f}'
596            format specifier for floating point values
597        floatformat_fiteqn : str, default=floatformat
598            format specifier for slope and intercept (a,b) in y = a x + b
599        stringlength : int, default=None
600            length of the variables on output
601            default (None) is to use the length of the longest variable name
602        
603        Returns
604        -------
605        summary : str
606            names and values of variables
607        '''
608        # List of variables
609        variables = self._expand_variables(variables)
610
611        if floatformat_fiteqn is None:
612            floatformat_fiteqn = floatformat
613        if stringlength is None:
614            stringlength = np.max([len(v) for v in variables])
615        stringformat = '{:'+str(stringlength)+'s}'
616
617        # Get a dict containing the needed variables
618        summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn )
619
620        # Extract length of the float numbers from floatformat
621        # import re
622        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
623        #       floatformat )[0] ) ).astype(int)
624
625        # summary = (stringformat+'{:>10s}').format('Variable','Value')
626        summarytext = ''
627        for k,v in summarydict.items():
628            vstr = _number2str(v,intformat,floatformat)
629            summarytext += (stringformat+' = {:s}\n').format(k,vstr)
630
631        return summarytext
632
633    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
634                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
635                          loc=None, loc_units='axes',
636                          **kwargs):
637        '''Display bivariate statistics as a table inset on a plot axis
638
639        Parameters
640        ----------
641        ax : matplotlib.Figure.Axis 
642            axis where the table will be displayed
643        variables : list or str, default='common'
644            names of attribute variables to include in summary
645            names are case insensitive            
646            The following strings are also accepted in place of a list 
647                "all" (displays all variables)
648                "common" (displays all measures of mean difference)
649        fitline_kw : dict, default=None
650            keywords passed to `fitline`
651        intformat : str, default='{:d}'
652            format specifier for integer values
653        floatformat : str, default='{:.3f}'
654            format specifier for floating point values
655        floatformat_fiteqn : str, default=floatformat
656            format specifier for slope and intercept (a,b) in y = a x + b
657        loc : tuple (x0,y0), default=(0.85, 0.05)
658            location on the axis where the table will be drawn
659            can be in data units or axes units [0-1]
660        loc_units : {'axes' (default), 'data'}
661            specifies whether loc has 'data' units or 'axes' units [0-1]
662                    
663        Returns
664        -------
665        text1, text2 : matplotlib text object
666            Artist for the two text boxes        
667        '''
668        # List of variables
669        variables = self._expand_variables(variables)
670
671        if floatformat_fiteqn is None:
672            floatformat_fiteqn = floatformat
673
674        # Default location in lower right corner
675        if loc is None:
676            loc = (0.8,0.05)
677
678        # Coordinates for loc
679        if loc_units.lower()=='data':
680            coord=ax.transData
681        elif loc_units.lower() in ['axes','axis']:
682            coord=ax.transAxes
683        else:
684            raise ValueError('Display units should be "Data" or "Axes"')
685
686        # Get a dict containing the needed variables
687        summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn )
688
689        # Column of label text
690        label_text = '\n'.join([_texify_name(key)
691                                for key in summarydict])
692        # Column of value text
693        value_text = '\n'.join([_number2str(v,intformat,floatformat)
694                                for v in summarydict.values()])
695
696        # Check if horizontal alignment keyword is used
697        ha=''
698        try:
699            ha = kwargs['ha']
700        except KeyError:
701            pass
702        try:
703            ha = kwargs['horizontalalignment']
704        except KeyError:
705            pass
706
707        # For right alignment, align on values first
708        # Otherwise, align on labels
709        if ha=='right':
710            first_text = value_text
711            second_text = label_text
712            sign = -1
713        else:
714            first_text = label_text
715            second_text = value_text
716            sign = +1
717
718        # Add first column of text
719        t1=ax.text(loc[0],loc[1],
720                first_text,
721                transform=coord,
722                **kwargs
723                )
724
725        # Get width of first text column
726        bbox = t1.get_window_extent().transformed(coord.inverted())
727        width = bbox.x1-bbox.x0
728
729        # Add second column of text
730        t2 = ax.text(loc[0]+width*1.05*sign,loc[1],
731                     second_text,
732                     transform=coord,
733                     **kwargs
734                     )
735
736        ##################################
737        # Early version of this function using matplotlib.table.table()
738
739        # if isinstance(loc,(tuple,list)):
740        #     # Create an inset axis to contain the table
741        #     tableaxis = ax.inset_axes(loc)
742        #     table_width=1
743        # else:
744        #     tableaxis = ax
745
746        # # Display the table on the axis
747        # return mtable.table(
748        #     tableaxis,
749        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
750        #     rowLabels=[texify_name(key) for key in summarydict],
751        #     colWidths=[table_width/2]*2,
752        #     edges=edges,
753        #     loc=loc, bbox=bbox
754        #     )
755
756        return [t1,t2]
class BivariateStatistics:
188class BivariateStatistics:
189    '''A suite of common statistics to quantify bivariate relationships
190
191    Class method 'summary' provides a formatted summary of these statistics
192    
193    Attributes
194    ----------
195    count, n : int
196        number of valid (not NaN) data value pairs
197    xmean, ymean : float
198        mean of x and y variables
199    xmedian, ymedian :float
200        median of x and y variables
201    xstd, ystd : float
202        standard deviation of x and y variables
203    mean_difference, md : float
204        ymean - xmean
205    std_difference, stdd : float
206        std( y - x )
207    mean_absolute_difference, mad : float
208        mean( |y-x| )
209    relative_mean_difference, rmd : float
210        md / xmean
211    relative_mean_absolute_difference, rmad :float
212        mad / xmean
213    standardized_mean_difference, smd : float
214        md / xstd
215    standardized_mean_absolute_difference, smad : float
216        mad /xstd
217    mean_relative_difference, mrd : float
218        mean(y/x) - 1
219    mean_absolute_relative_difference, mrd : float
220        mean(abs(y/x - 1))
221    mean_log10_ratio, mlr : float
222        mean( log10(y/x) )
223    std_log10_ratio, stdlr : float
224        std( log10(y/x) )
225    mean_absolute_log10_ratio, malr : float
226        mean( abs( log10(y/x) ) )
227    median_difference, medd : float
228        median(y-x)
229    median_absolute_difference, medad : float
230        median(|y-x|)
231    relative_median_difference, rmedd : float
232        median(y-x) / xmedian
233    relative_median_absolute_difference, rmedad : float
234        median(|y-x|) / xmedian
235    median_relative_difference, medianrd, medrd : float
236        median(y/x)-1
237    median_log10_ratio, medlr : float
238        median( log10(y/x) )
239    median_absolute_log10_ratio, medalr : float
240        median( abs( log10(y/x) ) )
241    normalized_mean_bias_factor, nmbf : float
242        see `nmbf` 
243    normalized_mean_absolute_error_factor, nmaef : float
244        see `nmaef`
245    root_mean_square_difference, rmsd : float
246        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
247    root_mean_square_log10_ratio, rmslr : float
248        $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$
249    covariance : float
250        cov(x,y)
251    correlation_pearson, correlation, pearsonr, R, r : float
252        Pearson linear correlation coefficient 
253    correlation_spearman, spearmanr : float
254        Spearman, non-parametric rank correlation coefficient
255    R2, r2 : float
256        Linear coefficient of determination, $R^2$
257    '''
258
259    def __init__(self,x,y,w=None,dropna=False,data=None):
260        '''Compute suite of bivariate statistics during initialization
261        
262        Statistic values are saved in attributes.
263        CAUTION: Weights w are ignored except in SMA fit
264
265        Parameters
266        ----------
267        x : ndarray or str
268            independent variable values
269        y : ndarray or str
270            dependent variable values, same size as x
271        w : ndarray or str, optional
272            weights for points (x,y), same size as x and y
273        dropna : bool, optional (default=False)
274            drops NaN values from x, y, and w
275        data : dict-like, optional
276            if x, y, or w are str, then they should be keys in data
277        '''
278
279        # Get values from data if needed
280        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
281            raise ValueError( 'Data argument must be used if x, y, or w is a string')
282        if isinstance(x,str):
283            x = data[x]
284        if isinstance(y,str):
285            y = data[y]
286        if isinstance(w,str):
287            w = data[w]
288
289        #Ensure that x and y have same length
290        if len(x) != len(y):
291            raise ValueError( 'Arguments x and y must have the same length' )
292        if w is None:
293            w = np.ones_like(x)
294        if len(w) != len(x):
295            raise ValueError( 'Argument w (if present) must have the same length as x' )
296
297        # Drop NaN values
298        if dropna:
299            isna = np.isnan(x*y*w)
300            x = x[~isna]
301            y = y[~isna]
302            w = w[~isna]
303
304        # Differences and ratios used repeatedly
305        diff = y - x
306        absdiff = np.abs( y - x )
307        # Ignore divide by zero and 0/0 while dividing
308        old_settings = np.seterr(divide='ignore',invalid='ignore')
309        ratio = y/x
310        log10ratio = np.log10(ratio)
311        np.seterr(**old_settings)
312
313        # Number of data points
314        self.count = self.n = len(x)
315
316        # Means, medians, and standard deviations
317        self.xmean = np.mean(x)
318        self.ymean = np.mean(y)
319        self.xmedian = np.median(x)
320        self.ymedian = np.median(y)
321        self.xstd   = np.std(x)
322        self.ystd   = np.std(y)
323
324        # Save values for use later
325        self._x = x
326        self._y = y
327        self._w = w
328
329        # Mean and mean absolute differences
330        self.mean_difference            = self.md   = self.ymean - self.xmean
331        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
332        self.std_difference             = self.stdd = np.std( diff )
333
334        # Relative and standardized differences
335        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
336        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
337        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
338        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
339
340        # Mean and median relative differences
341        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
342        self.mean_absolute_relative_difference   = self.mard  = np.mean( np.abs( ratio - 1 ) )
343        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
344        self.median_absolute_relative_difference = self.medianard = self.medard = np.median( np.abs( ratio - 1 ) )
345
346        # Median and median absolute differences
347        self.median_difference          = self.medd  = np.median( diff )
348        self.median_absolute_difference = self.medad = np.median( absdiff )
349
350        # Relative median differences
351        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
352        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
353
354        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
355        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
356
357        # Mean and mean absolute log ratio
358        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
359        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
360        self.std_log10_ratio           = self.stdlr= np.std( log10ratio )
361
362        # Median and median absolute log ratio
363        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
364        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
365
366        # RMS difference
367        self.root_mean_square_difference = self.rmsd   = np.sqrt( np.mean( np.power( diff, 2) ) )
368        # RMS log ratio
369        self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 )))
370
371        # Covariance, correlation
372        self.covariance = self.cov = np.cov(x,y)[0][1]
373        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
374            np.corrcoef(x,y)[0][1]
375        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
376        self.R2 = self.r2 = self.R**2
377
378    def __getitem__(self,key):
379        '''Accesses attribute values via object['key']'''
380        return getattr(self,key)
381
382    def fitline(self,method='sma',intercept=True,**kwargs):
383        '''Compute bivariate line fit
384        
385        Parameters
386        ----------
387        method : str
388            line fitting method: sma (default), ols, wls, York, sen, siegel
389        intercept : bool
390            defines whether non-zero intercept should be fitted
391        **kwargs 
392            passed to `acgc.stats.sma` (e.g. robust=True)
393
394        Returns
395        -------
396        result : dict
397            dictionary with keys:
398            - slope (float)
399                slope of fitted line
400            - intercept (float)
401                intercept of fitted line
402            - fittedvalues (array (N,))
403                values on fit line
404            - residuals (array (N,))
405                residual from fit line
406        '''
407
408        fitintercept = intercept
409
410        if method.lower()=='sma':
411            fit = sma(  self._x,
412                        self._y,
413                        self._w,
414                        intercept=fitintercept,
415                        **kwargs)
416            slope = fit['slope']
417            intercept= fit['intercept']
418
419        elif method.lower()=='ols':
420            if fitintercept:
421                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
422                                      self._y, rcond=None )
423            else:
424                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
425            slope = ols[0][0]
426            intercept = ols[0][1]
427
428        elif method.lower() in ['theil','sen','theilsen']:
429            fitintercept = True
430            fit = sen( self._x,
431                       self._y,
432                       **kwargs)
433            slope = fit['slope']
434            intercept = fit['intercept']
435
436        elif method.lower()=='siegel':
437            fitintercept = True
438            siegel = stats.siegelslopes( self._x,
439                                         self._y )
440            slope = siegel.slope
441            intercept = siegel.intercept
442
443        elif method.lower()=='wls':
444            raise NotImplementedError('WLS regression not implemented yet')
445
446        elif method.lower()=='york':
447            raise NotImplementedError('York regression not implemented yet')
448
449        else:
450            raise ValueError('Undefined method '+method)
451
452        line = dict( slope          = slope,
453                     intercept      = intercept,
454                     fittedvalues   = slope * self._x + intercept,
455                     residuals      = self._y - ( slope * self._x + intercept ),
456                     method         = method,
457                     fitintercept   = fitintercept )
458
459        return line
460
461    def slope(self,method='sma',intercept=True,**kwargs):
462        '''Compute slope of bivariate line fit
463        
464        Parameters
465        ----------
466        method : str
467            line fitting method: sma (default), ols, wls
468        intercept : bool
469            defines whether non-zero intercept should be fitted
470        **kwargs 
471            passed to `fitline`
472
473        Returns
474        -------
475        slope : float
476            value of y intercept
477        '''
478        return self.fitline(method,intercept,**kwargs)['slope']
479
480    def intercept(self,method='sma',intercept=True,**kwargs):
481        '''Compute intercept of bivariate line fit
482        
483        Parameters
484        ----------
485        method : str
486            line fitting method: sma (default) or ols
487        intercept : bool
488            defines whether non-zero intercept should be fitted
489        **kwargs 
490            passed to `fitline`
491
492        Returns
493        -------
494        intercept : float
495            value of y intercept
496        '''
497        return self.fitline(method,intercept,**kwargs)['intercept']
498
499    def _expand_variables(self,variables):
500        '''Expand special strings into a list of variables
501        
502        Parameter
503        ---------
504        variables : list or str, default='common'
505            Special strings ("all","common") will be expanded to a list of variables
506            list arguments will not be modified
507
508        Returns
509        -------
510        list 
511            variable names
512        '''
513        if variables is None:
514            variables='common'
515        if variables=='all':
516            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
517                       'MLR','MALR',
518                       'MedD','MedAD','RMedD','RMedAD','MedRD',
519                       'MedLR','MedALR',
520                       'NMBF','NMAEF','RMSD','cov',
521                       'R','R2','spearmanr','slope','intercept',
522                       'fitline','n']
523        elif variables=='common':
524            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n']
525        if not isinstance(variables,list):
526            raise ValueError(
527                'variables must be a list, None, or one of these strings: "all","common"')
528
529        return variables
530
531    def _summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
532        '''Summarize bivariate statistics into a dict
533
534        Parameters
535        ----------
536        vars : list or str, default='common'
537            names of attribute variables to include in summary
538            names are case insensitive            
539            The following strings are also accepted in place of a list 
540                "all" (displays all variables)
541                "common" (displays all measures of mean difference)
542        fitline_kw : dict, default=None
543            keywords passed to `fitline`
544        floatformat_fiteqn : str, default=floatformat
545            format specifier for slope and intercept (a,b) in y = a x + b
546        
547        Returns
548        -------
549        summary : dict
550            names and values of variables
551        '''
552
553        # List of variables
554        variables = self._expand_variables(variables)
555
556        if fitline_kw is None:
557            fitline_kw = {'method':'sma',
558                          'intercept':True}
559
560        # Construct the dict
561        summary = {}
562        for v in variables:
563            if v in ['slope','intercept']:
564                # These variables are object methods
565                func = getattr(self,v)
566                value = func(**fitline_kw)
567            elif v == 'fitline':
568                line = self.fitline(**fitline_kw)
569                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
570            else:
571                # Retrieve values
572                value = getattr(self,v.lower())
573
574            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
575            summary[v] = value
576
577        return summary
578
579    def summary(self, variables=None, fitline_kw=None,
580                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
581                stringlength=None ):
582        '''Summarize bivariate statistics
583
584        Parameters
585        ----------
586        vars : list or str, default='common'
587            names of attribute variables to include in summary
588            names are case insensitive            
589            The following strings are also accepted in place of a list 
590                "all" (displays all variables)
591                "common" (displays all measures of mean difference)
592        fitline_kw : dict, default=None
593            keywords passed to `fitline`
594        intformat : str, default='{:d}'
595            format specifier for integer values
596        floatformat : str, default='{:.4f}'
597            format specifier for floating point values
598        floatformat_fiteqn : str, default=floatformat
599            format specifier for slope and intercept (a,b) in y = a x + b
600        stringlength : int, default=None
601            length of the variables on output
602            default (None) is to use the length of the longest variable name
603        
604        Returns
605        -------
606        summary : str
607            names and values of variables
608        '''
609        # List of variables
610        variables = self._expand_variables(variables)
611
612        if floatformat_fiteqn is None:
613            floatformat_fiteqn = floatformat
614        if stringlength is None:
615            stringlength = np.max([len(v) for v in variables])
616        stringformat = '{:'+str(stringlength)+'s}'
617
618        # Get a dict containing the needed variables
619        summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn )
620
621        # Extract length of the float numbers from floatformat
622        # import re
623        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
624        #       floatformat )[0] ) ).astype(int)
625
626        # summary = (stringformat+'{:>10s}').format('Variable','Value')
627        summarytext = ''
628        for k,v in summarydict.items():
629            vstr = _number2str(v,intformat,floatformat)
630            summarytext += (stringformat+' = {:s}\n').format(k,vstr)
631
632        return summarytext
633
634    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
635                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
636                          loc=None, loc_units='axes',
637                          **kwargs):
638        '''Display bivariate statistics as a table inset on a plot axis
639
640        Parameters
641        ----------
642        ax : matplotlib.Figure.Axis 
643            axis where the table will be displayed
644        variables : list or str, default='common'
645            names of attribute variables to include in summary
646            names are case insensitive            
647            The following strings are also accepted in place of a list 
648                "all" (displays all variables)
649                "common" (displays all measures of mean difference)
650        fitline_kw : dict, default=None
651            keywords passed to `fitline`
652        intformat : str, default='{:d}'
653            format specifier for integer values
654        floatformat : str, default='{:.3f}'
655            format specifier for floating point values
656        floatformat_fiteqn : str, default=floatformat
657            format specifier for slope and intercept (a,b) in y = a x + b
658        loc : tuple (x0,y0), default=(0.85, 0.05)
659            location on the axis where the table will be drawn
660            can be in data units or axes units [0-1]
661        loc_units : {'axes' (default), 'data'}
662            specifies whether loc has 'data' units or 'axes' units [0-1]
663                    
664        Returns
665        -------
666        text1, text2 : matplotlib text object
667            Artist for the two text boxes        
668        '''
669        # List of variables
670        variables = self._expand_variables(variables)
671
672        if floatformat_fiteqn is None:
673            floatformat_fiteqn = floatformat
674
675        # Default location in lower right corner
676        if loc is None:
677            loc = (0.8,0.05)
678
679        # Coordinates for loc
680        if loc_units.lower()=='data':
681            coord=ax.transData
682        elif loc_units.lower() in ['axes','axis']:
683            coord=ax.transAxes
684        else:
685            raise ValueError('Display units should be "Data" or "Axes"')
686
687        # Get a dict containing the needed variables
688        summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn )
689
690        # Column of label text
691        label_text = '\n'.join([_texify_name(key)
692                                for key in summarydict])
693        # Column of value text
694        value_text = '\n'.join([_number2str(v,intformat,floatformat)
695                                for v in summarydict.values()])
696
697        # Check if horizontal alignment keyword is used
698        ha=''
699        try:
700            ha = kwargs['ha']
701        except KeyError:
702            pass
703        try:
704            ha = kwargs['horizontalalignment']
705        except KeyError:
706            pass
707
708        # For right alignment, align on values first
709        # Otherwise, align on labels
710        if ha=='right':
711            first_text = value_text
712            second_text = label_text
713            sign = -1
714        else:
715            first_text = label_text
716            second_text = value_text
717            sign = +1
718
719        # Add first column of text
720        t1=ax.text(loc[0],loc[1],
721                first_text,
722                transform=coord,
723                **kwargs
724                )
725
726        # Get width of first text column
727        bbox = t1.get_window_extent().transformed(coord.inverted())
728        width = bbox.x1-bbox.x0
729
730        # Add second column of text
731        t2 = ax.text(loc[0]+width*1.05*sign,loc[1],
732                     second_text,
733                     transform=coord,
734                     **kwargs
735                     )
736
737        ##################################
738        # Early version of this function using matplotlib.table.table()
739
740        # if isinstance(loc,(tuple,list)):
741        #     # Create an inset axis to contain the table
742        #     tableaxis = ax.inset_axes(loc)
743        #     table_width=1
744        # else:
745        #     tableaxis = ax
746
747        # # Display the table on the axis
748        # return mtable.table(
749        #     tableaxis,
750        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
751        #     rowLabels=[texify_name(key) for key in summarydict],
752        #     colWidths=[table_width/2]*2,
753        #     edges=edges,
754        #     loc=loc, bbox=bbox
755        #     )
756
757        return [t1,t2]

A suite of common statistics to quantify bivariate relationships

Class method 'summary' provides a formatted summary of these statistics

Attributes
  • count, n (int): number of valid (not NaN) data value pairs
  • xmean, ymean (float): mean of x and y variables
  • xmedian, ymedian (float): median of x and y variables
  • xstd, ystd (float): standard deviation of x and y variables
  • mean_difference, md (float): ymean - xmean
  • std_difference, stdd (float): std( y - x )
  • mean_absolute_difference, mad (float): mean( |y-x| )
  • relative_mean_difference, rmd (float): md / xmean
  • relative_mean_absolute_difference, rmad (float): mad / xmean
  • standardized_mean_difference, smd (float): md / xstd
  • standardized_mean_absolute_difference, smad (float): mad /xstd
  • mean_relative_difference, mrd (float): mean(y/x) - 1
  • mean_absolute_relative_difference, mrd (float): mean(abs(y/x - 1))
  • mean_log10_ratio, mlr (float): mean( log10(y/x) )
  • std_log10_ratio, stdlr (float): std( log10(y/x) )
  • mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
  • median_difference, medd (float): median(y-x)
  • median_absolute_difference, medad (float): median(|y-x|)
  • relative_median_difference, rmedd (float): median(y-x) / xmedian
  • relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
  • median_relative_difference, medianrd, medrd (float): median(y/x)-1
  • median_log10_ratio, medlr (float): median( log10(y/x) )
  • median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
  • normalized_mean_bias_factor, nmbf (float): see nmbf
  • normalized_mean_absolute_error_factor, nmaef (float): see nmaef
  • root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
  • root_mean_square_log10_ratio, rmslr (float): $\sqrt{ \langle \log_{10}(y/x)^2 \rangle }$
  • covariance (float): cov(x,y)
  • correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
  • correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
  • R2, r2 (float): Linear coefficient of determination, $R^2$
BivariateStatistics(x, y, w=None, dropna=False, data=None)
259    def __init__(self,x,y,w=None,dropna=False,data=None):
260        '''Compute suite of bivariate statistics during initialization
261        
262        Statistic values are saved in attributes.
263        CAUTION: Weights w are ignored except in SMA fit
264
265        Parameters
266        ----------
267        x : ndarray or str
268            independent variable values
269        y : ndarray or str
270            dependent variable values, same size as x
271        w : ndarray or str, optional
272            weights for points (x,y), same size as x and y
273        dropna : bool, optional (default=False)
274            drops NaN values from x, y, and w
275        data : dict-like, optional
276            if x, y, or w are str, then they should be keys in data
277        '''
278
279        # Get values from data if needed
280        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
281            raise ValueError( 'Data argument must be used if x, y, or w is a string')
282        if isinstance(x,str):
283            x = data[x]
284        if isinstance(y,str):
285            y = data[y]
286        if isinstance(w,str):
287            w = data[w]
288
289        #Ensure that x and y have same length
290        if len(x) != len(y):
291            raise ValueError( 'Arguments x and y must have the same length' )
292        if w is None:
293            w = np.ones_like(x)
294        if len(w) != len(x):
295            raise ValueError( 'Argument w (if present) must have the same length as x' )
296
297        # Drop NaN values
298        if dropna:
299            isna = np.isnan(x*y*w)
300            x = x[~isna]
301            y = y[~isna]
302            w = w[~isna]
303
304        # Differences and ratios used repeatedly
305        diff = y - x
306        absdiff = np.abs( y - x )
307        # Ignore divide by zero and 0/0 while dividing
308        old_settings = np.seterr(divide='ignore',invalid='ignore')
309        ratio = y/x
310        log10ratio = np.log10(ratio)
311        np.seterr(**old_settings)
312
313        # Number of data points
314        self.count = self.n = len(x)
315
316        # Means, medians, and standard deviations
317        self.xmean = np.mean(x)
318        self.ymean = np.mean(y)
319        self.xmedian = np.median(x)
320        self.ymedian = np.median(y)
321        self.xstd   = np.std(x)
322        self.ystd   = np.std(y)
323
324        # Save values for use later
325        self._x = x
326        self._y = y
327        self._w = w
328
329        # Mean and mean absolute differences
330        self.mean_difference            = self.md   = self.ymean - self.xmean
331        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
332        self.std_difference             = self.stdd = np.std( diff )
333
334        # Relative and standardized differences
335        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
336        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
337        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
338        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
339
340        # Mean and median relative differences
341        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
342        self.mean_absolute_relative_difference   = self.mard  = np.mean( np.abs( ratio - 1 ) )
343        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
344        self.median_absolute_relative_difference = self.medianard = self.medard = np.median( np.abs( ratio - 1 ) )
345
346        # Median and median absolute differences
347        self.median_difference          = self.medd  = np.median( diff )
348        self.median_absolute_difference = self.medad = np.median( absdiff )
349
350        # Relative median differences
351        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
352        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
353
354        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
355        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
356
357        # Mean and mean absolute log ratio
358        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
359        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
360        self.std_log10_ratio           = self.stdlr= np.std( log10ratio )
361
362        # Median and median absolute log ratio
363        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
364        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
365
366        # RMS difference
367        self.root_mean_square_difference = self.rmsd   = np.sqrt( np.mean( np.power( diff, 2) ) )
368        # RMS log ratio
369        self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 )))
370
371        # Covariance, correlation
372        self.covariance = self.cov = np.cov(x,y)[0][1]
373        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
374            np.corrcoef(x,y)[0][1]
375        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
376        self.R2 = self.r2 = self.R**2

Compute suite of bivariate statistics during initialization

Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit

Parameters
  • x (ndarray or str): independent variable values
  • y (ndarray or str): dependent variable values, same size as x
  • w (ndarray or str, optional): weights for points (x,y), same size as x and y
  • dropna (bool, optional (default=False)): drops NaN values from x, y, and w
  • data (dict-like, optional): if x, y, or w are str, then they should be keys in data
xmean
ymean
xmedian
ymedian
xstd
ystd
def fitline(self, method='sma', intercept=True, **kwargs):
382    def fitline(self,method='sma',intercept=True,**kwargs):
383        '''Compute bivariate line fit
384        
385        Parameters
386        ----------
387        method : str
388            line fitting method: sma (default), ols, wls, York, sen, siegel
389        intercept : bool
390            defines whether non-zero intercept should be fitted
391        **kwargs 
392            passed to `acgc.stats.sma` (e.g. robust=True)
393
394        Returns
395        -------
396        result : dict
397            dictionary with keys:
398            - slope (float)
399                slope of fitted line
400            - intercept (float)
401                intercept of fitted line
402            - fittedvalues (array (N,))
403                values on fit line
404            - residuals (array (N,))
405                residual from fit line
406        '''
407
408        fitintercept = intercept
409
410        if method.lower()=='sma':
411            fit = sma(  self._x,
412                        self._y,
413                        self._w,
414                        intercept=fitintercept,
415                        **kwargs)
416            slope = fit['slope']
417            intercept= fit['intercept']
418
419        elif method.lower()=='ols':
420            if fitintercept:
421                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
422                                      self._y, rcond=None )
423            else:
424                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
425            slope = ols[0][0]
426            intercept = ols[0][1]
427
428        elif method.lower() in ['theil','sen','theilsen']:
429            fitintercept = True
430            fit = sen( self._x,
431                       self._y,
432                       **kwargs)
433            slope = fit['slope']
434            intercept = fit['intercept']
435
436        elif method.lower()=='siegel':
437            fitintercept = True
438            siegel = stats.siegelslopes( self._x,
439                                         self._y )
440            slope = siegel.slope
441            intercept = siegel.intercept
442
443        elif method.lower()=='wls':
444            raise NotImplementedError('WLS regression not implemented yet')
445
446        elif method.lower()=='york':
447            raise NotImplementedError('York regression not implemented yet')
448
449        else:
450            raise ValueError('Undefined method '+method)
451
452        line = dict( slope          = slope,
453                     intercept      = intercept,
454                     fittedvalues   = slope * self._x + intercept,
455                     residuals      = self._y - ( slope * self._x + intercept ),
456                     method         = method,
457                     fitintercept   = fitintercept )
458
459        return line

Compute bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to acgc.stats.sma (e.g. robust=True)
Returns
  • result (dict): dictionary with keys:
    • slope (float) slope of fitted line
    • intercept (float) intercept of fitted line
    • fittedvalues (array (N,)) values on fit line
    • residuals (array (N,)) residual from fit line
def slope(self, method='sma', intercept=True, **kwargs):
461    def slope(self,method='sma',intercept=True,**kwargs):
462        '''Compute slope of bivariate line fit
463        
464        Parameters
465        ----------
466        method : str
467            line fitting method: sma (default), ols, wls
468        intercept : bool
469            defines whether non-zero intercept should be fitted
470        **kwargs 
471            passed to `fitline`
472
473        Returns
474        -------
475        slope : float
476            value of y intercept
477        '''
478        return self.fitline(method,intercept,**kwargs)['slope']

Compute slope of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • slope (float): value of y intercept
def intercept(self, method='sma', intercept=True, **kwargs):
480    def intercept(self,method='sma',intercept=True,**kwargs):
481        '''Compute intercept of bivariate line fit
482        
483        Parameters
484        ----------
485        method : str
486            line fitting method: sma (default) or ols
487        intercept : bool
488            defines whether non-zero intercept should be fitted
489        **kwargs 
490            passed to `fitline`
491
492        Returns
493        -------
494        intercept : float
495            value of y intercept
496        '''
497        return self.fitline(method,intercept,**kwargs)['intercept']

Compute intercept of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default) or ols
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • intercept (float): value of y intercept
def summary( self, variables=None, fitline_kw=None, intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, stringlength=None):
579    def summary(self, variables=None, fitline_kw=None,
580                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
581                stringlength=None ):
582        '''Summarize bivariate statistics
583
584        Parameters
585        ----------
586        vars : list or str, default='common'
587            names of attribute variables to include in summary
588            names are case insensitive            
589            The following strings are also accepted in place of a list 
590                "all" (displays all variables)
591                "common" (displays all measures of mean difference)
592        fitline_kw : dict, default=None
593            keywords passed to `fitline`
594        intformat : str, default='{:d}'
595            format specifier for integer values
596        floatformat : str, default='{:.4f}'
597            format specifier for floating point values
598        floatformat_fiteqn : str, default=floatformat
599            format specifier for slope and intercept (a,b) in y = a x + b
600        stringlength : int, default=None
601            length of the variables on output
602            default (None) is to use the length of the longest variable name
603        
604        Returns
605        -------
606        summary : str
607            names and values of variables
608        '''
609        # List of variables
610        variables = self._expand_variables(variables)
611
612        if floatformat_fiteqn is None:
613            floatformat_fiteqn = floatformat
614        if stringlength is None:
615            stringlength = np.max([len(v) for v in variables])
616        stringformat = '{:'+str(stringlength)+'s}'
617
618        # Get a dict containing the needed variables
619        summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn )
620
621        # Extract length of the float numbers from floatformat
622        # import re
623        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
624        #       floatformat )[0] ) ).astype(int)
625
626        # summary = (stringformat+'{:>10s}').format('Variable','Value')
627        summarytext = ''
628        for k,v in summarydict.items():
629            vstr = _number2str(v,intformat,floatformat)
630            summarytext += (stringformat+' = {:s}\n').format(k,vstr)
631
632        return summarytext

Summarize bivariate statistics

Parameters
  • vars (list or str, default='common'): names of attribute variables to include in summary names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • intformat : str, default='{ (d}'): format specifier for integer values
  • floatformat : str, default='{ (.4f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
Returns
  • summary (str): names and values of variables
def summary_fig_inset( self, ax, variables=None, fitline_kw=None, intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, loc=None, loc_units='axes', **kwargs):
634    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
635                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
636                          loc=None, loc_units='axes',
637                          **kwargs):
638        '''Display bivariate statistics as a table inset on a plot axis
639
640        Parameters
641        ----------
642        ax : matplotlib.Figure.Axis 
643            axis where the table will be displayed
644        variables : list or str, default='common'
645            names of attribute variables to include in summary
646            names are case insensitive            
647            The following strings are also accepted in place of a list 
648                "all" (displays all variables)
649                "common" (displays all measures of mean difference)
650        fitline_kw : dict, default=None
651            keywords passed to `fitline`
652        intformat : str, default='{:d}'
653            format specifier for integer values
654        floatformat : str, default='{:.3f}'
655            format specifier for floating point values
656        floatformat_fiteqn : str, default=floatformat
657            format specifier for slope and intercept (a,b) in y = a x + b
658        loc : tuple (x0,y0), default=(0.85, 0.05)
659            location on the axis where the table will be drawn
660            can be in data units or axes units [0-1]
661        loc_units : {'axes' (default), 'data'}
662            specifies whether loc has 'data' units or 'axes' units [0-1]
663                    
664        Returns
665        -------
666        text1, text2 : matplotlib text object
667            Artist for the two text boxes        
668        '''
669        # List of variables
670        variables = self._expand_variables(variables)
671
672        if floatformat_fiteqn is None:
673            floatformat_fiteqn = floatformat
674
675        # Default location in lower right corner
676        if loc is None:
677            loc = (0.8,0.05)
678
679        # Coordinates for loc
680        if loc_units.lower()=='data':
681            coord=ax.transData
682        elif loc_units.lower() in ['axes','axis']:
683            coord=ax.transAxes
684        else:
685            raise ValueError('Display units should be "Data" or "Axes"')
686
687        # Get a dict containing the needed variables
688        summarydict = self._summary_dict( variables, fitline_kw, floatformat_fiteqn )
689
690        # Column of label text
691        label_text = '\n'.join([_texify_name(key)
692                                for key in summarydict])
693        # Column of value text
694        value_text = '\n'.join([_number2str(v,intformat,floatformat)
695                                for v in summarydict.values()])
696
697        # Check if horizontal alignment keyword is used
698        ha=''
699        try:
700            ha = kwargs['ha']
701        except KeyError:
702            pass
703        try:
704            ha = kwargs['horizontalalignment']
705        except KeyError:
706            pass
707
708        # For right alignment, align on values first
709        # Otherwise, align on labels
710        if ha=='right':
711            first_text = value_text
712            second_text = label_text
713            sign = -1
714        else:
715            first_text = label_text
716            second_text = value_text
717            sign = +1
718
719        # Add first column of text
720        t1=ax.text(loc[0],loc[1],
721                first_text,
722                transform=coord,
723                **kwargs
724                )
725
726        # Get width of first text column
727        bbox = t1.get_window_extent().transformed(coord.inverted())
728        width = bbox.x1-bbox.x0
729
730        # Add second column of text
731        t2 = ax.text(loc[0]+width*1.05*sign,loc[1],
732                     second_text,
733                     transform=coord,
734                     **kwargs
735                     )
736
737        ##################################
738        # Early version of this function using matplotlib.table.table()
739
740        # if isinstance(loc,(tuple,list)):
741        #     # Create an inset axis to contain the table
742        #     tableaxis = ax.inset_axes(loc)
743        #     table_width=1
744        # else:
745        #     tableaxis = ax
746
747        # # Display the table on the axis
748        # return mtable.table(
749        #     tableaxis,
750        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
751        #     rowLabels=[texify_name(key) for key in summarydict],
752        #     colWidths=[table_width/2]*2,
753        #     edges=edges,
754        #     loc=loc, bbox=bbox
755        #     )
756
757        return [t1,t2]

Display bivariate statistics as a table inset on a plot axis

Parameters
  • ax (matplotlib.Figure.Axis): axis where the table will be displayed
  • variables (list or str, default='common'): names of attribute variables to include in summary names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • intformat : str, default='{ (d}'): format specifier for integer values
  • floatformat : str, default='{ (.3f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • loc (tuple (x0,y0), default=(0.85, 0.05)): location on the axis where the table will be drawn can be in data units or axes units [0-1]
  • loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
  • text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb(x0, x1):
22def nmb( x0, x1 ):
23    '''Compute Normalized Mean Bias (NMB)
24
25    NMB = ( mean(x1) - mean(x0) ) / mean(x0)
26
27    Parameters
28    ----------
29    x0 : array_like
30        reference values
31    x1 : array_like
32        experiment values
33    '''
34
35    assert (len(x0) == len(x1)), \
36        "Parameters x0 and x1 must have the same length"
37
38    # Mean values
39    x0_mean = np.mean(x0)
40    x1_mean = np.mean(x1)
41
42    # Metric value
43    return x1_mean / x0_mean - 1

Compute Normalized Mean Bias (NMB)

NMB = ( mean(x1) - mean(x0) ) / mean(x0)

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmae(x0, x1):
45def nmae( x0, x1 ):
46    '''Compute Normalized Mean Absolute Error (NMAE)
47
48    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
49
50    Parameters
51    ---------
52    x0 : array_like
53        reference values
54    x1 : array_like
55        experiment values
56    '''
57
58     # Mean values
59    x0_mean = np.mean(x0)
60
61    # Mean absolute difference
62    abs_diff = np.mean( np.abs(x1 - x0) )
63
64    # Metric value
65    return abs_diff / np.abs( x0_mean )

Compute Normalized Mean Absolute Error (NMAE)

NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmbf(x0, x1):
68def nmbf( x0, x1 ):
69    '''Compute Normalized Mean Bias Factor (NMBF)
70
71    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
72
73    Parameters
74    ----------
75    x0 : array_like
76        reference values
77    x1 : array_like
78        experiment values
79    '''
80
81    # Ensure that arguments have the same length
82    assert (len(x0) == len(x1)), \
83        "Parameters x0 and x1 must have the same length"
84
85    # Mean values
86    x0_mean = np.mean(x0)
87    x1_mean = np.mean(x1)
88
89    # Metric value
90    if x1_mean >= x0_mean:
91        result = x1_mean / x0_mean - 1
92    else:
93        result= 1 - x0_mean / x1_mean
94    # Equivalent (faster?) implementation
95    #S = (mMean - oMean) / np.abs(mMean - oMean)
96    #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 )
97
98    return result

Compute Normalized Mean Bias Factor (NMBF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmaef(x0, x1):
100def nmaef( x0, x1 ):
101    '''Compute Normalized Mean Absolute Error Factor (NMAEF)
102
103    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
104    
105    Parameters
106    ----------
107    x0 : array_like
108        reference values
109    x1 : array_like
110        experiment values
111    '''
112
113    # Ensure that arguments have the same length
114    assert (len(x0) == len(x1)), \
115        "Parameters x0 and x1 must have the same length"
116
117    # Mean values
118    x0_mean = np.mean(x0)
119    x1_mean = np.mean(x1)
120
121    # Mean absolute difference
122    abs_diff = np.mean( np.abs(x1 - x0))
123
124    # Metric value
125    if x1_mean >= x0_mean:
126        result = abs_diff / x0_mean 
127    else:
128        result = abs_diff / x1_mean
129    # Equivalent (faster?) implementation
130    #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean)
131    #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) )
132
133    return result

Compute Normalized Mean Absolute Error Factor (NMAEF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values