#pval.pm

#Copyright (C) 1998 Michael Levitt, Mark Gerstein, Cyrus Wilson

#This file contains routines (1/4/98) for calculating P values for sequence and
#structure alignments.  The routines are those described in
#Levitt, M. and Gerstein, M. (1998) Proc. Natl. Acad. Sci. USA 95, 5913-5920.
#Gerstein, M. and Levitt, M. (1998) Protein Science 7, 445-456.

#When using any of these routines please cite the above papers.

#NB: The PNAS paper cited above contains an error on page 5917 in the values
#    of two parameters used in the calculation of the structural comparison
#    score p-value. The correct values are a = 171.8 and b = -419.3
#    The calc_structal_p_value_NS routine in this package correctly calculates
#    this structural p-value.

package pval;


#------------------------
sub calc_sequence_p_value { 
#------------------------
 
# calculate the sequence P-value from the analytical expression
# This is an extreme value distribution with
# p(Z) = exp(-Z-exp(Z)) which equals 1/e at Z = 0
# P(Z) = int(p(Z)) = 1-1/exp(1/exp(Z))
# P(Z)             = 1-exp(-1/exp(Z))
# P(Z)             = 1-exp(-exp(-Z))

# Parameters N1: residues in sequence 1; N2: residues in sequence 2;
# and SWS: Smith-Waterman score for the alignment.

# Solve for Z in terms to P(Z) to give Z = -log(-log(P(Z)))

  local ( $N1, $N2, $SWS ) = @_;
  local ( @p1, $pol, $Z, $Mean_SWS, $SD_SWS, $lCpr, $Cpr );

  $n12 = $N1 * $N2;
  $ln12 = log($n12);

  $SD_SWS   = 5.843705;
  $Mean_SWS = $ln12 * 5.843705 - 26.287011;

  $Z = ( $SWS - $Mean_SWS ) / $SD_SWS;

# P(Z) = 1-exp(-exp(-Z))
# Note: for large positive Z, exp(-Z) -> 0   and P(Z) -> 0 as exp(-Z)
#       for large negative Z, exp(-Z) -> big and P(Z) -> 1

  if ( $Z > 20 ) { $Cpr = exp ( -$Z ) }
  else { 
    $expZ = exp(-$Z); $Cpr = 1-exp ( -$expZ );
  }

  if ( $Cpr <= 0 ) { $Cpr = 1e-200; }
  $z1_glob = $Z;


  return ( $Cpr );

}


#---------------------------
sub calc_structal_p_value_NR { 
#---------------------------

# This used N and R as arguments 

# Calculate the structal P-value from the analytical expression
# use a sixth order polymonial in two seperate segment to approximate
# the cumulative distribution of exp(-Z^4)

# Parameters N: number of residues aligned, and R, the RMS.

  local ( $N, $R ) = @_;
  local ( @p1, $pol, $Z, $Mean_lnR, $SD_lnR, $lCpr, $Cpr );
  local ( $sum, $iz, $z, $sum );
  local ( $lN );

# integrate the density numerically 

  if ( !$z4_table ) {
    $z4_table = 1;

    $ln60_gl = log(60);
    @para_gl = ( 0.155057, -0.619306, 1.726955, 0.092226, 0.212374);
    $bpar_gl = 2.0*$ln60_gl*$para_gl[0] + $para_gl[1]; 
    $apar_gl = ($ln60_gl**2)*$para_gl[0] + $ln60_gl*$para_gl[1] + $para_gl[2] - $bpar_gl*$ln60_gl;

    $sum = 0.0;
    $zmin = -5.0;
    $dz = 0.001;
    for ( $iz = 0; $iz <= 10000; $iz++ ) { 
      $z = $iz * $dz + $zmin;
      $sum += $dz*exp( -$z**4 );
      $CDF[$iz] = $sum;
    } 
    for ( $iz = 0; $iz <= 10000; $iz++ ) { 
      $CDF[$iz] /= $CDF[10000];
    } 
  }

  $lN = log($N);
  if ( $N < 60 ) { 
    $Mean_lnR = $lN*$lN*$para_gl[0] + $lN*$para_gl[1] + $para_gl[2];
    $SD_lnR   = $lN*$para_gl[3] + $para_gl[4];
  }
  else { 
    $Mean_lnR = $lN*$bpar_gl + $apar_gl;
    $SD_lnR   = $ln60_gl*$para_gl[3] + $para_gl[4];
  } 


  $Z = ( log($R) - $Mean_lnR ) / $SD_lnR;

# Get integral from a table

  $iz = int ( ( $Z - $zmin ) / $dz ); 
  if ( $iz < 0 ) { $iz = 0 }; if ( $iz > 10000 ) { $iz = 10000 }

  $Cpr = $CDF[$iz];


  $z1_glob = $Z;
  return ( $Cpr );

}


#---------------------------
sub calc_structal_p_value_NS { 
#---------------------------

# Calculate the structal P-value from the analytical expression

# Parameters N: number of residues aligned,
# and S: the structural comparison score (Sstr)

  local ( $N, $MS ) = @_;
  local ( $Z, $Mean_MS, $SD_MS, $Cpr, $expZ );
  local ( $lN, $ln60, @para, $bpar, $apar );

  
  $lN = log($N);
  $ln60 = log(120);
  @para = ( 18.411424, -4.501719, 2.637163, 21.351857, -37.521269);
  $bpar = 2.0*$ln60*$para[0] + $para[1]; 
  $apar = ($ln60**2)*$para[0] + $ln60*$para[1] + $para[2] - $bpar*$ln60;

# $bpar corresponds to the coefficient a on page 5917 in
# Levitt, M. and Gerstein, M. (1998) Proc. Natl. Acad. Sci. USA 95, 5913-5920.

# $apar corresponds to the coefficient b

# Note that in the paper these values are reported incorrectly. The values
# are switched and a negative sign is lost.  According to the relationships
# between a and b and the rest of the coefficients, the correct values are:
# a = 171.8 and b = -419.3
# Calculating the values based on c, d, e, f, and g (as this routine does
# above) will yield the correct values for a and b.

  if ( $N < 120 ) {
    $Mean_MS = $lN*$lN*$para[0] + $lN*$para[1] + $para[2];
    $SD_MS   = $lN*$para[3] + $para[4];
  }
  else { 
    $Mean_MS = $lN*$bpar + $apar;
    $SD_MS   = $ln60*$para[3] + $para[4];
  } 

  $Z = ( $MS - $Mean_MS ) / $SD_MS;

# P(Z) = 1-exp(-exp(-Z))
# Note: for large positive Z, exp(-Z) -> 0   and P(Z) -> 0 as exp(-Z)
#       for large negative Z, exp(-Z) -> big and P(Z) -> 1

  if ( $Z > 20 ) { $Cpr = exp ( -$Z ) }
  else { 
    $expZ = exp(-$Z); $Cpr = 1-exp ( -$expZ );
  }

  $z1_glob = $Z;
  return ( $Cpr );

}

return(1);