; DO NOT RUN THIS AGAIN AS IT KEEPS ITERATING!!!!!!! DO: .run obs_mslp_as .run obs_mslp_as_infill .run obs_mslp_as_infill_iter2 .run obs_mslp_as_infill_iter2 THEN STOP!!!!! ; ; SECOND ITERATION OF INFILLING!!!! ; Infills missing Apr-Sep MSLP values by using PCR-based method. ; One difficulty is that some EOFs of the hemispheric SLP have dominance ; in particular regions, but not in others. To infill some points, therefore, ; some EOFs are of little use. We could simply increase the number of EOFs ; to retain, to make sure some are kept that have power in each location, but ; this raises the chance of overfitting. So, what I now do is to perform ; four different PCAs on (1) 0-180, (2) 90-270, (3) 180-360, and (4) 270-90 ; portions of the SLP data, and then use the closest region to each point: ; for 45-134 use (1), for 135-224 use (2), for 225-314 use (3), and for ; 315-44 use (4). ; minr=0.55 ; minimum validated correlation acceptable for infilling ; 50%=0.71 40%=0.63 30%=0.55 ; ; Restore Apr-Sep MSLP gridded dataset ; restore,filename='obs_mslp_as.idlsave' ; timey,fdseas,xlon,ylat,nyr,nx,ny,fdltm,fdsd,missfrac kkk=missfrac restore,filename='obs_mslp_as_infill.idlsave' ; timey,fdseas,xlon,ylat,nyr,nx,ny,fdltm,fdsd,missfrac ; ; Building of the PCR model requires some series with complete data. ; If we use all grid boxes, we will be limited to using 30% of the boxes, ; because 70% have at least some years missing. If we omit the 15 years ; with least coverage, then 69% of the boxes will have complete coverage. ; We can then use the PCR model to infill values as best as we can for ; the subset of years used, but must use some other method for the 15 omitted ; years. ; ; Select the years to analyse ; ksub=where(missfrac le 0.19,nsub) if nsub ne 73 then message,'Ooops!' ; fdsub=fdseas(*,*,ksub) yrsub=timey(ksub) ; ; Perform PCA on subsetted data; automatically uses only those with complete ; data ; nregion=5 ; 4 overlapping regions for different PCA, plus region 0 is ALL usemode=[5,3,4,4,5] ; use only 5 modes now we've got less data nretain=8 ; number of modes to use, a balance between overfitting and fitting cormat=0 ; 0=covariance PCA, 1=correlation PCA ; for iregion = 0 , nregion-1 do begin case iregion of 0: fd1=fdsub 1: begin keepx=where(xlon le 175) fd1=fdsub(keepx,*,*) end 2: begin keepx=where((xlon ge 90) and (xlon le 265)) fd1=fdsub(keepx,*,*) end 3: begin keepx=where(xlon ge 180) fd1=fdsub(keepx,*,*) end 4: begin keepx=where((xlon ge 270) or (xlon le 85)) fd1=fdsub(keepx,*,*) end endcase ; myeof2d,fd1,ev1,ea1,lam,lampct,lamcum,nretain=nretain,correlation=cormat ; if iregion eq 0 then begin ev=ev1 ea=fltarr(nsub,nretain,nregion) endif ea(*,*,iregion)=ea1(*,*) endfor ; ; Make a field to contain status of each box (0=unfilled 1=infilled 2=complete) ; ;fdstatus=fltarr(nx,ny) fddum=reform(ev(*,*,0)) kcomp=where(finite(fddum)) kmiss=where(finite(fddum) eq 0,nmiss) ;fdstatus(kcomp)=2. ; ; For each box with some missing data, try to infill with PCR ; print,'Number to infill:',nmiss for imiss = 0 , nmiss-1 do begin print,imiss,format='($,I5)' ; ; Extract MSLP series and check for sufficient non-missing data ; locx=kmiss(imiss) mod nx xxx=xlon(locx) if (xxx ge 45) and (xxx le 134) then iregion=1 if (xxx ge 135) and (xxx le 224) then iregion=2 if (xxx ge 225) and (xxx le 314) then iregion=3 if (xxx ge 315) or (xxx le 44) then iregion=4 print,xxx,iregion,format='($,I4,I2)' nnn=usemode(iregion) locy=fix(kmiss(imiss)/nx) onets=reform(fdsub(locx,locy,*)) kgot=where(finite(onets),ngot) kwant=where(finite(onets) eq 0,nwant) ; ; Because of the need for separate testing/training sets, need at least 30 ; years of data ; if ngot ge 30 then begin ; ; Split data into two equal segments ; setlen1=fix(ngot/2) setlen2=ngot-setlen1 kset1=kgot(0:setlen1-1) kset2=kgot(setlen1:ngot-1) ; for iter = 0 , 2 do begin ; repeat for test,training & vice versa & full case iter of 0: begin ktrain=kset1 ktest=kset2 end 1: begin rmean=rskill ktrain=kset2 ktest=kset1 end 2: begin rmean=(rmean+rskill)*0.5 print,rmean,format='($,F7.2)' ktrain=kgot ktest=kwant ; this is in fact the predictions we want, cannot test it end endcase ; ; Build PCR (PC-based multiple linear regression) ; depts=onets(ktrain) indts=transpose(ea(ktrain,0:nnn-1,iregion)) wts=fltarr(n_elements(ktrain))+1. pcrcoeff=regress(indts,depts,wts,yfit,pcrconst,pcrsigma,pcrf,lincorr,$ mulcorr,pcrchi,pcrstat,/relative_weight) ; ; If regression failed, set skill to zero, else evaluate skill ; if pcrstat ne 0 then begin rskill=-9.99 endif else begin depts=onets(ktest) indts=transpose(ea(ktest,0:nnn-1,iregion)) nval=n_elements(ktest) prets=fltarr(nval) for ival = 0 , nval-1 do begin prets(ival)=pcrconst+total(pcrcoeff*indts(*,ival)) endfor rskill=correlate(depts,prets) endelse print,mulcorr,rskill,format='($,2F7.2)' ; ; Determine whether skill is high enough, and store reconstruction ; if it is ; if iter eq 2 then begin if rmean ge minr then begin fdstatus(locx,locy)=1. print,' INFILLED' fdsub(locx,locy,kwant)=prets(*) endif else begin print,' FAILED' endelse endif ; endfor ; endif else begin print,' TOO SHORT:',ngot endelse ; endfor ; dummy=where(fdstatus eq 1,ninfill) print,'Total infilled = ',ninfill ; ; Plot layout of missing, infilled and complete data boxes ; loadct,39 multi_plot,nrow=2 def_1color,10,color='white' def_1color,11,color='black' def_1color,12,color='mgrey' map=def_map(/npolar) & map.limit(0)=15. coast=def_coast(/get_device) & coast.double=0 sm=def_sm(/off) & sm.cyclic=0 inter_boxfd,fdstatus,xlon,ylat,map=map,$ coast=coast,sm=sm,$ levels=[-0.5,0.5,1.5,2.5],c_colors=[10,11,12] ; ; Put infilled subset back into full dataset ; fdseas(*,*,ksub)=fdsub(*,*,*) ; ; Also produce a year-by-year timeseries of the fraction of missing data ; nall=total(finite(fdltm)) nmiss=float(total(finite(fdseas),1)) nmiss=1.-(total(nmiss,1)/nall) missfrac=nmiss ; plot,timey,nmiss,psym=10,/xstyle,xtitle='Year',$ ytitle='Fraction of data missing per summer',yrange=[0,0.5],/ystyle oplot,timey,kkk,thick=3,psym=10 ; ; Now save the results in the same 'format' as the uninfilled data, so they ; can be switched at ease ; save,filename='obs_mslp_as_infill.idlsave',$ timey,fdseas,xlon,ylat,nyr,nx,ny,fdltm,fdsd,missfrac,fdstatus ; end