#pval.pm #Copyright (C) 1998 Michael Levitt, Mark Gerstein, Cyrus Wilson #This file contains routines (1/4/98) for calculating P values for sequence and #structure alignments. The routines are those described in #Levitt, M. and Gerstein, M. (1998) Proc. Natl. Acad. Sci. USA 95, 5913-5920. #Gerstein, M. and Levitt, M. (1998) Protein Science 7, 445-456. #When using any of these routines please cite the above papers. #NB: The PNAS paper cited above contains an error on page 5917 in the values # of two parameters used in the calculation of the structural comparison # score p-value. The correct values are a = 171.8 and b = -419.3 # The calc_structal_p_value_NS routine in this package correctly calculates # this structural p-value. package pval; #------------------------ sub calc_sequence_p_value { #------------------------ # calculate the sequence P-value from the analytical expression # This is an extreme value distribution with # p(Z) = exp(-Z-exp(Z)) which equals 1/e at Z = 0 # P(Z) = int(p(Z)) = 1-1/exp(1/exp(Z)) # P(Z) = 1-exp(-1/exp(Z)) # P(Z) = 1-exp(-exp(-Z)) # Parameters N1: residues in sequence 1; N2: residues in sequence 2; # and SWS: Smith-Waterman score for the alignment. # Solve for Z in terms to P(Z) to give Z = -log(-log(P(Z))) local ( $N1, $N2, $SWS ) = @_; local ( @p1, $pol, $Z, $Mean_SWS, $SD_SWS, $lCpr, $Cpr ); $n12 = $N1 * $N2; $ln12 = log($n12); $SD_SWS = 5.843705; $Mean_SWS = $ln12 * 5.843705 - 26.287011; $Z = ( $SWS - $Mean_SWS ) / $SD_SWS; # P(Z) = 1-exp(-exp(-Z)) # Note: for large positive Z, exp(-Z) -> 0 and P(Z) -> 0 as exp(-Z) # for large negative Z, exp(-Z) -> big and P(Z) -> 1 if ( $Z > 20 ) { $Cpr = exp ( -$Z ) } else { $expZ = exp(-$Z); $Cpr = 1-exp ( -$expZ ); } if ( $Cpr <= 0 ) { $Cpr = 1e-200; } $z1_glob = $Z; return ( $Cpr ); } #--------------------------- sub calc_structal_p_value_NR { #--------------------------- # This used N and R as arguments # Calculate the structal P-value from the analytical expression # use a sixth order polymonial in two seperate segment to approximate # the cumulative distribution of exp(-Z^4) # Parameters N: number of residues aligned, and R, the RMS. local ( $N, $R ) = @_; local ( @p1, $pol, $Z, $Mean_lnR, $SD_lnR, $lCpr, $Cpr ); local ( $sum, $iz, $z, $sum ); local ( $lN ); # integrate the density numerically if ( !$z4_table ) { $z4_table = 1; $ln60_gl = log(60); @para_gl = ( 0.155057, -0.619306, 1.726955, 0.092226, 0.212374); $bpar_gl = 2.0*$ln60_gl*$para_gl[0] + $para_gl[1]; $apar_gl = ($ln60_gl**2)*$para_gl[0] + $ln60_gl*$para_gl[1] + $para_gl[2] - $bpar_gl*$ln60_gl; $sum = 0.0; $zmin = -5.0; $dz = 0.001; for ( $iz = 0; $iz <= 10000; $iz++ ) { $z = $iz * $dz + $zmin; $sum += $dz*exp( -$z**4 ); $CDF[$iz] = $sum; } for ( $iz = 0; $iz <= 10000; $iz++ ) { $CDF[$iz] /= $CDF[10000]; } } $lN = log($N); if ( $N < 60 ) { $Mean_lnR = $lN*$lN*$para_gl[0] + $lN*$para_gl[1] + $para_gl[2]; $SD_lnR = $lN*$para_gl[3] + $para_gl[4]; } else { $Mean_lnR = $lN*$bpar_gl + $apar_gl; $SD_lnR = $ln60_gl*$para_gl[3] + $para_gl[4]; } $Z = ( log($R) - $Mean_lnR ) / $SD_lnR; # Get integral from a table $iz = int ( ( $Z - $zmin ) / $dz ); if ( $iz < 0 ) { $iz = 0 }; if ( $iz > 10000 ) { $iz = 10000 } $Cpr = $CDF[$iz]; $z1_glob = $Z; return ( $Cpr ); } #--------------------------- sub calc_structal_p_value_NS { #--------------------------- # Calculate the structal P-value from the analytical expression # Parameters N: number of residues aligned, # and S: the structural comparison score (Sstr) local ( $N, $MS ) = @_; local ( $Z, $Mean_MS, $SD_MS, $Cpr, $expZ ); local ( $lN, $ln60, @para, $bpar, $apar ); $lN = log($N); $ln60 = log(120); @para = ( 18.411424, -4.501719, 2.637163, 21.351857, -37.521269); $bpar = 2.0*$ln60*$para[0] + $para[1]; $apar = ($ln60**2)*$para[0] + $ln60*$para[1] + $para[2] - $bpar*$ln60; # $bpar corresponds to the coefficient a on page 5917 in # Levitt, M. and Gerstein, M. (1998) Proc. Natl. Acad. Sci. USA 95, 5913-5920. # $apar corresponds to the coefficient b # Note that in the paper these values are reported incorrectly. The values # are switched and a negative sign is lost. According to the relationships # between a and b and the rest of the coefficients, the correct values are: # a = 171.8 and b = -419.3 # Calculating the values based on c, d, e, f, and g (as this routine does # above) will yield the correct values for a and b. if ( $N < 120 ) { $Mean_MS = $lN*$lN*$para[0] + $lN*$para[1] + $para[2]; $SD_MS = $lN*$para[3] + $para[4]; } else { $Mean_MS = $lN*$bpar + $apar; $SD_MS = $ln60*$para[3] + $para[4]; } $Z = ( $MS - $Mean_MS ) / $SD_MS; # P(Z) = 1-exp(-exp(-Z)) # Note: for large positive Z, exp(-Z) -> 0 and P(Z) -> 0 as exp(-Z) # for large negative Z, exp(-Z) -> big and P(Z) -> 1 if ( $Z > 20 ) { $Cpr = exp ( -$Z ) } else { $expZ = exp(-$Z); $Cpr = 1-exp ( -$expZ ); } $z1_glob = $Z; return ( $Cpr ); } return(1);