/* :: np_dens.gau :: :: Driver program to demonstrate use of np_dens() proc: :: Create a vector of normal random variables, provide :: nonparametric density estimates for a wide variety :: of bandwidth choices. Plot the bandwidths against :: the cross-validation scores, and plot the estimated :: density using the bandwidth that minimizes the least :: squares cross validation score function. :: :: Mico Loretan, March 1996 */ library pgraph; #include pgraph.ext; graphset; _pdate=""; n = 10000; /* sample size */ a = -5; /* lower endpoint of estimation interval */ b = 5; /* upper endpoint */ M = 2^10; /* number of points in [a,b] at which to estimate the density of x */ h = seqa(0.01,0.01,50); /* vector of bandwidths */ /* the "optimal" bandwidth will be around 0.15 for this sample size and type of distribution; I choose a wide range of candidate bandwidths to demonstrate how the np_dens() proc may be utilized in practice. */ rndseed 6376347; x = rndn(n,1); /* create the "data:" N(0,1) variates */ /* ** estimate a density function (over xgrid) for every value of h, ** and calculate the associated least squares cross validation ** score */ {xgrid,fhat,cv} = np_dens(x,a,b,M,h); #ifUNIX vv = { 0,0,640,480,40,80,1,6,15,0,0,2,2 }; call WinSetActive(WinOpenPQG(vv,"npdens","")); #endif /* First graph: plot bandwidths against cross validation scores */ xtics(0,0.5,0.1,10); xlabel("bandwidths"); ylabel("cv scores"); xy(h,cv); #ifUNIX vv = { 100,100,640,480,40,80,1,6,15,0,0,2,2 }; call WinSetActive(WinOpenPQG(vv,"npdens","")); #endif /* Second graph: plot the estimated density for the "optimal" bandwidth parameter, along with the "true" N(0,1) density */ xtics(-4,4,1,1); ytics(0,0.45,0.05,1); xlabel("Normal Distribution"); ylabel("density"); xy(xgrid,fhat[.,minindc(cv)]~pdfn(xgrid)); #ifUNIX call WinSetActive(1); #endif /* :: np_dens.src :: :: Compute estimates of a (univariate) density, using a Gaussian :: kernel and FFT/inverse FFT methods, for various bandwidths. :: For use with "complex" versions of Gauss only. :: :: April 1991, April 1992; revised March 1996. :: :: Author: Mico Loretan :: email: loretanm@frb.gov :: worK: +(202) 452-2219 :: :: Note: This code may be distributed freely, as long as this file :: is distributed in its entirety, including this notice. :: All modifications are made at your own risk. No warranties made. :: Please report problems and/or suspected bugs to the code's author. */ proc (3)=np_dens(x,a,b,M,h); /* :: Inputs: x data vector, length n :: a,b scalars, interval over which to compute :: the density estimates (needed: amaxc(x)+3*maxc(h); otherwise, :: you will get aliasing problems. :: M scalar, number of grid points between a and b; :: this number must be a power of 2. :: h vector of "bandwidths", length k :: (needed: h_i>0 forall i) :: :: Outputs: t points at which density was evaluated :: f matrix, of order M by k :: each column corresponds to one bandwidth :: cv least-squares cross validation scores, :: vector of length k :: :: Note: The algorithm and the names of variables are based on :: B. W. Silverman, "Density Estimation," Chapman and Hall, 1986, :: especially pp. 63--66. */ local t, f, cv, n, d, c, i, k, j, xx, xi, y, l, s, zstar, sh, psi0; /* :: Step 0: Check for valid inputs, discard all data points :: not in [a,b], and sort remaing data in ascending order. */ if (minc(h) <= 0); errorlog "Bandwidth parameter(s) MUST be positive ..."; end; endif; if (a >= b); errorlog "Lower endpoint of estimation interval MUST be"; errorlog " small than upper endpoint ..."; end; endif; j = ln(M)/ln(2); /* check if M is a power of 2 */ if ( abs(j-round(j)) > 1e-9); errorlog "M MUST be a power of 2..."; end; endif; x = delif( x, (x.b) ); if (scalmiss(x)); errorlog "No datapoints in interval [a,b] ..."; end; endif; x = sortc(x,1); /* data must be sorted for next step */ n = rows(x); /* number of (remaining) datapoints */ /* :: Step 1: Discretize the data on an M-element grid over [a,b]. */ d = (b-a)/(M-1); /* set width of grid steps */ t = seqa(a,d,M); /* grid: t[1]=a, t[M]=b */ c = counts(x,t); /* number of data points falling into each grid interval; c has M entries */ c = c[2:M]; /* discard first entry of c (=0) */ xi = zeros(M,1); /* initialize weights vector */ k = 1; j = 1; do until (k==M); /* loop over each grid point */ if (c[k]>0); /* pick off data points in (t[k],t[k+1]) */ xx = x[j:j+c[k]-1]; /* compute weights at t[k] and t[k+1] */ xi[k] = xi[k] + sumc(t[k+1]-xx); xi[k+1] = xi[k+1] + sumc( xx-t[k]); j = j+c[k]; endif; k = k+1; endo; xi = xi/(n*d*d); /* normalize: weights must add up to 1/d */ /* :: Step 2: Compute the FFT of the weight sequence xi, :: and create the auxiliary sequence s. :: (cf. Silverman, page 64) */ y = rfft(xi); /* y is a complex vector of length M */ l = seqa(0,1,M/2+1) | seqa(-M/2+1,1,M/2-1); /* :: This odd-seeming ordering is required due to the :: way the Gauss FFT algorithm orders its output. */ s = l*2*pi/(b-a); /* :: Steps 3 & 4: For each bandwidth, compute the vector zstar :: and the corresponding density estimate, f, via an inverse FFT. */ f = zeros(M,rows(h)); i = 1; do until (i>rows(h)); zstar = exp(-(s^2*h[i]^2)/2) .* y; f[.,i] = abs(ffti(zstar)); /* alternatively, use f[.,i] = rffti(zstar) */ i = i+1; endo; /* :: Step 5: Compute the least squares cross validation score :: for each density estimate. */ y = y[2:M/2+1]; /* we only need half the "data" here */ s = s[2:M/2+1]; sh = -s^2.*(h^2)'; /* outer product: matrix of order M/2 by k */ disable; /* disable complaints about underflows... */ psi0= -1+2*(b-a)*sumc( (exp(sh)-2*exp(0.5*sh)).*(abs(y)^2) ); ndpclex; enable; cv = psi0 + 2./(n*sqrt(2*pi)*h); /* equation 3.62, p. 65 */ retp(t,f,cv); endp; /* np_dens() */