#pval.pm #Copyright (C) 1998 Michael Levitt, Mark Gerstein, Cyrus Wilson #This file contains routines (1/4/98) for calculating P values for sequence and #structure alignments. The routines are those described in #Levitt, M. and Gerstein, M. (1998) Proc. Natl. Acad. Sci. USA 95, 5913-5920. #Gerstein, M. and Levitt, M. (1998) Protein Science 7, 445-456. #When using any of these routines please cite the above papers. #NB: The PNAS paper cited above contains an error on page 5917 in the values # of two parameters used in the calculation of the structural comparison # score p-value. The correct values are a = 171.8 and b = -419.3 # The calc_structal_p_value_NS routine in this package correctly calculates # this structural p-value. package pval; #------------------------ sub calc_sequence_p_value { #------------------------ # calculate the sequence P-value from the analytical expression # This is an extreme value distribution with # p(Z) = exp(-Z-exp(Z)) which equals 1/e at Z = 0 # P(Z) = int(p(Z)) = 1-1/exp(1/exp(Z)) # P(Z) = 1-exp(-1/exp(Z)) # P(Z) = 1-exp(-exp(-Z)) # Parameters N1: residues in sequence 1; N2: residues in sequence 2; # and SWS: Smith-Waterman score for the alignment. # Solve for Z in terms to P(Z) to give Z = -log(-log(P(Z))) local ( \$N1, \$N2, \$SWS ) = @_; local ( @p1, \$pol, \$Z, \$Mean_SWS, \$SD_SWS, \$lCpr, \$Cpr ); \$n12 = \$N1 * \$N2; \$ln12 = log(\$n12); \$SD_SWS = 5.843705; \$Mean_SWS = \$ln12 * 5.843705 - 26.287011; \$Z = ( \$SWS - \$Mean_SWS ) / \$SD_SWS; # P(Z) = 1-exp(-exp(-Z)) # Note: for large positive Z, exp(-Z) -> 0 and P(Z) -> 0 as exp(-Z) # for large negative Z, exp(-Z) -> big and P(Z) -> 1 if ( \$Z > 20 ) { \$Cpr = exp ( -\$Z ) } else { \$expZ = exp(-\$Z); \$Cpr = 1-exp ( -\$expZ ); } if ( \$Cpr <= 0 ) { \$Cpr = 1e-200; } \$z1_glob = \$Z; return ( \$Cpr ); } #--------------------------- sub calc_structal_p_value_NR { #--------------------------- # This used N and R as arguments # Calculate the structal P-value from the analytical expression # use a sixth order polymonial in two seperate segment to approximate # the cumulative distribution of exp(-Z^4) # Parameters N: number of residues aligned, and R, the RMS. local ( \$N, \$R ) = @_; local ( @p1, \$pol, \$Z, \$Mean_lnR, \$SD_lnR, \$lCpr, \$Cpr ); local ( \$sum, \$iz, \$z, \$sum ); local ( \$lN ); # integrate the density numerically if ( !\$z4_table ) { \$z4_table = 1; \$ln60_gl = log(60); @para_gl = ( 0.155057, -0.619306, 1.726955, 0.092226, 0.212374); \$bpar_gl = 2.0*\$ln60_gl*\$para_gl[0] + \$para_gl[1]; \$apar_gl = (\$ln60_gl**2)*\$para_gl[0] + \$ln60_gl*\$para_gl[1] + \$para_gl[2] - \$bpar_gl*\$ln60_gl; \$sum = 0.0; \$zmin = -5.0; \$dz = 0.001; for ( \$iz = 0; \$iz <= 10000; \$iz++ ) { \$z = \$iz * \$dz + \$zmin; \$sum += \$dz*exp( -\$z**4 ); \$CDF[\$iz] = \$sum; } for ( \$iz = 0; \$iz <= 10000; \$iz++ ) { \$CDF[\$iz] /= \$CDF[10000]; } } \$lN = log(\$N); if ( \$N < 60 ) { \$Mean_lnR = \$lN*\$lN*\$para_gl[0] + \$lN*\$para_gl[1] + \$para_gl[2]; \$SD_lnR = \$lN*\$para_gl[3] + \$para_gl[4]; } else { \$Mean_lnR = \$lN*\$bpar_gl + \$apar_gl; \$SD_lnR = \$ln60_gl*\$para_gl[3] + \$para_gl[4]; } \$Z = ( log(\$R) - \$Mean_lnR ) / \$SD_lnR; # Get integral from a table \$iz = int ( ( \$Z - \$zmin ) / \$dz ); if ( \$iz < 0 ) { \$iz = 0 }; if ( \$iz > 10000 ) { \$iz = 10000 } \$Cpr = \$CDF[\$iz]; \$z1_glob = \$Z; return ( \$Cpr ); } #--------------------------- sub calc_structal_p_value_NS { #--------------------------- # Calculate the structal P-value from the analytical expression # Parameters N: number of residues aligned, # and S: the structural comparison score (Sstr) local ( \$N, \$MS ) = @_; local ( \$Z, \$Mean_MS, \$SD_MS, \$Cpr, \$expZ ); local ( \$lN, \$ln60, @para, \$bpar, \$apar ); \$lN = log(\$N); \$ln60 = log(120); @para = ( 18.411424, -4.501719, 2.637163, 21.351857, -37.521269); \$bpar = 2.0*\$ln60*\$para[0] + \$para[1]; \$apar = (\$ln60**2)*\$para[0] + \$ln60*\$para[1] + \$para[2] - \$bpar*\$ln60; # \$bpar corresponds to the coefficient a on page 5917 in # Levitt, M. and Gerstein, M. (1998) Proc. Natl. Acad. Sci. USA 95, 5913-5920. # \$apar corresponds to the coefficient b # Note that in the paper these values are reported incorrectly. The values # are switched and a negative sign is lost. According to the relationships # between a and b and the rest of the coefficients, the correct values are: # a = 171.8 and b = -419.3 # Calculating the values based on c, d, e, f, and g (as this routine does # above) will yield the correct values for a and b. if ( \$N < 120 ) { \$Mean_MS = \$lN*\$lN*\$para[0] + \$lN*\$para[1] + \$para[2]; \$SD_MS = \$lN*\$para[3] + \$para[4]; } else { \$Mean_MS = \$lN*\$bpar + \$apar; \$SD_MS = \$ln60*\$para[3] + \$para[4]; } \$Z = ( \$MS - \$Mean_MS ) / \$SD_MS; # P(Z) = 1-exp(-exp(-Z)) # Note: for large positive Z, exp(-Z) -> 0 and P(Z) -> 0 as exp(-Z) # for large negative Z, exp(-Z) -> big and P(Z) -> 1 if ( \$Z > 20 ) { \$Cpr = exp ( -\$Z ) } else { \$expZ = exp(-\$Z); \$Cpr = 1-exp ( -\$expZ ); } \$z1_glob = \$Z; return ( \$Cpr ); } return(1);