filename in_l ('d:\timeclus\jae\last_version\data\data_l.txt') ; 
filename in_f ('d:\timeclus\jae\last_version\data\data_f.txt') ; 
*filename in_l ('..data_l.txt') ; 
*filename in_f ('..data_f.txt') ; 

data new ; infile in_l ; 
   input YEAR L1-L15;
   label
L1= 'Per Capita GNP in 1982$'
L9= '% of Households With Telephones'
L2= '1/Infant Mort Rate Per 1000 Births'
L10= '% of Households With Radios'
L3='Male Life Expectancy At Birth'
L4='Female Life Expectancy At Birth'
l11='100,000 per Homicide'
L5='100 - % Unemployment'
L12='% of Ages 5-17 in School'
L6='Per Capita Dis Income in 1982$'
L13='Newspapers per capita'
L7='Physicians Per 100,000'
L14='Rate of Real GNP Growth (1982$)'
l15='% of GNP not for Defense'
L8='Surfaced Highway Miles per capita';
run; 

data new2 ; infile in_f ; 
   input YEAR f1-f15; 
label f1='Arima residual for l1'
      f15='Arima residual for l15' ; 

run; 
data new ; merge new new2 ; run; 
run; 
 proc format ;
     VALUE  nnc
   1 = '      Per capita GNP'
   2 = '1 / Infant Mortality'
   3 = 'Male life expectancy'
   4 = 'Female life expectan'
   5 = '     Employment rate'
   6 = ' Per capita Disp inc'
   7 = 'Physicians / 100,000'
   8 = 'Hyw miles per capita'
   9 = ' % homes with phones'
  10 = '  % homes with radio'
  11 = '  100,000 / Homicide'
  12 = '% age 5-17 in school'
  13 = 'Newspapers / cap'
  14 = '  GNP rate of growth'
  15 = '   % GNP not defense';
     VALUE  nnca
   1 = 'GNP / cap(L1) '
   2 = '1 / Inf Mort(L2)'
   3 = 'M life exp(L3)  '
   4 = 'F life exp(L4)  '
   5 = 'Empl rate(L5)'
   6 = 'D inc/cap(L6)'
   7 = 'Phys / cap(L7)'
   8 = 'Hyw  / cap(L8)'
   9 = 'phones/ HH(L9)'
  10 = 'radios/HH(L10)'
  11 = 'cap / Hom(L11)'
  12 = '5-17 in sch(L12)'
  13 = 'Newsp / cap(L13)'
  14 = 'GNP growth(L14)'
  15 = '% GNP n def(L15)';
     VALUE  $nncc
  '1' = '      Per capita GNP'
  '2'= '1 / Infant Mortality'
  '3'= 'Male life expectancy'
  '4'= 'Female life expectan'
  '5'= '     Employment rate'
  '6'= ' Per capita Disp inc'
  '7'= 'Physicians / 100,000'
  '8'= 'Hyw miles per capita'
  '9'= ' % homes with phones'
 '10'= '  % homes with radio'
 '11'= '  100,000 / Homicide'
 '12'= '% age 5-17 in school'
 '13'= '    Newspapers / cap'
 '14'= '  GNP rate of growth'
 '15'= '   % GNP not defense';
*
   Routine to run Cluster analysis on the US QLF data by year
;

* title height=1. font=swissb 'Z-scores Normal kernels-100 1/4 bandwidth ' ; run;
PROC IML;
* -------------------------------------------------------------------;
  start kernel(z,f,xmin, xmax, nx, xinc) ;
   xinc = (xmax - xmin) / nx ;
   n=ncol(z) ; mz= z(|,:|) ; ssz = z(|,##|) ;
  vz = ssz-n#mz#mz; sz = sqrt(vz/n) ;
  h = sz # 3.49 # n ## (-1/3) ; * Scott bandwidth ;
  h = h /4 ;                                        *************** ;
 *  h = sz # 2 # n ## (-1/5) ; * Bowman bandwidth ;
  x = ((1:nx) # xinc ) + j(1,nx,xmin-xinc);
  f = j(1,nx,0) ;
  hn = h#n ; sqfive = sqrt(5) ; sqtwopi = sqrt(2 * 2#arsin(1)) ;
*
   Cycle through the values of x and compute the kernel for each case
;
   do jj = 1 to nx ;
     t =  ( j(1,n,x(|,jj|)) - z ) / h  ;
*
 *    Rectangular kernel
;
 *  f(|,jj|) =  sum ( ( abs(t) < j(1,n,1) ) ) /(2 # hn) ;
*
     Triangular  kernel
;
  *  f(|,jj|) =  sum ( ( abs(t) < j(1,n,1) )  #
                      ( j(1,n,1) - abs(t) )
                      ) /hn ;
*
    Epanechnikov
;
 *    f(|,jj|) = sum ( ( abs(t) < j(1,n,sqfive) ) #
                    (.75 # ( j(1,n,1) - .2 # t#t) / sqfive )
                    ) /hn;
*
    Normal
;
    f(|,jj|) = sum (
                    (exp (- .5 # t#t) / sqtwopi )
                    ) /hn;
   end;
   f = f / ( sum(f) # xinc ) ;  * normalize density to integrate to one;
finish ;
*--------------------------------------------------------------------;
START MAIN;

  LINK READX;
  CLUSTDAT = J(2*NIN-1,2,0);
  NDAT = J(2*NIN-1,1,1) ;
  CLUSTDAT(|1:(2*NIN-1),1|) = (1:(2#NIN-1))`;
  ID = CLUSTDAT(|1:NIN,1|) ;
  CLID = ID||J(NIN,1,1) ;
  CMEM = J(NIN,NIN,0) ;
  CMEM[,1] = sic;
  DMEM = J(NIN,NIN,0) ;
  DMEM[,1] = (1:nin)`;
  LINK DISTMAT;
  DO N = (NIN-1) TO 1 BY -1;
    LINK MINDIST;
    LINK COMBINE;
    LINK REDOID;
    LINK WRITEID;
    LINK REDIST; print n d [format=3.] ;
  END;
  LINK OUTREE ;
STOP;

READX: *---------------------------------------------------;

       use new ; * out61.usqual;
       read all var ('f1':'f15') into r where (year > 1915) ;    * Arima residuals ;
    
       read all var ('l1':'l15') into x[colname=obnames]
                                 where ( year > 1915) ;
*
    f - residual from ARIMA, l - actual data ;
;
       nin = ncol(x) ; k = nrow(x) ;
       sic = (1:nin)` ;
*
    Compute the kernel density for each residual
;
        rn = 1000;                                               ******* ;
        f = j(rn,nin,0) ;
        rmin = min(r) # 1.5 ;
        rmax = max(r) # 1.5 ;
        armax = max(abs(r)) ;
        rmin = armax # -1.5 ;                                   ******;
        rmax = armax # 1.5 ;
        rrange = rmax - rmin ;
        do ii = 1 to nin ;
          tr = r(|,ii|)`;
          run kernel( tr, tf, rmin, rmax, rn, rinc ) ;
         f(|,ii|) = tf`;
         temp = j(rn,1,ii) || ((1:rn)`) || tf ` ;
       
        if ii = 1 then create denplt from temp ;
                  else setout denplt; append from temp ;

       end;
       t_f = f ; * the density that does not change ;
       t_x = x ; * the values of series that do not change ;
       xxs = sic||x` ;
       create xxs from xxs ; append from xxs ;
       ffs = (1:rn)`||f ;
       fsnames = 'x' || obnames ;
       create ffs from ffs[colname=fsnames] ; append from ffs ;
       free tr tf temp ffs xxs fsnames;
RETURN;



DISTMAT: *---------------------------------------------------;

*  COMPUTE THE FULL DISTANCE MATRIX ;
  corrx =  corr(x) ; print corrx [format = 4.2] ;
  D = (j(nin,nin,1) - corrx ) # 50 ; * Use 1 - the correlation matrix go to dm1 ;
 *go to dm1;                                                    *********************;
  D = J(NIN,NIN,0); dx = d ; dn = d ;
  DO I1 = 1 TO NIN;
    DO J1 = 1 TO NIN;
      IF J1 < I1 THEN do;
      LINK CALCDIST; link calcdisx ; link calcdisn ;
      end ;
    END;
  END;
   d = (d / k) # 100 ;
   dn = (dn / k) # 100 ;
   dx = dx # 100 ;
  * d = d - dx ; * substract the distance due to uncertainty ;   **********************;
  * d = dx ; * use the uncertainty only measure Dr fig 8 ;       **********************;
  * d = dn ; * use the Normal common variance measure Dn fig 7;  **********************;
dm1:  dtemp = d + d` ;
     PRINT sic [format = nnc.] Dtemp [format = 3. colname=obnames rowname=obnames] ;
create distance from dtemp[colname=obnames rowname=obnames];
        append from dtemp[colname=obnames rowname=obnames] ;
  d = d + d` ;
  t_d = d ;
  free corrx dtemp dx;
RETURN;


MINDIST: *---------------------------------------------------;

  MIND = {1E60};
  DO II=1 TO N+1;
    I1=ID(|II,|);
    DO JJ=1 TO N+1;
      IF JJ < II THEN DO ;
        J1=ID(|JJ,|);
        IF D(|I1,J1|) < MIND THEN DO;
          MIND = D(|I1,J1|);
          IM = I1;
          JM = J1;
          IDM = II;
          JDM = JJ;
        END;
      END;
    END;
  END;
RETURN;

 COMBINE: *---------------------------------------------------;
 *  year locations ;
  am = cmem[jm,loc(cmem[jm,]) ] ;
  nm = cmem[im,loc(cmem[im,]) ] ;
  tm = am||nm ;
  cmem[jm,1:ncol(tm)] = tm ;

  *  relative locations ;
  am = dmem[jm,loc(dmem[jm,]) ] ;
  nm = dmem[im,loc(dmem[im,]) ] ;
  tm = am||nm ;
  dmem[jm,1:ncol(tm)] = tm ;
  /*  **************** Old code for using averages *************
*
   Use the average of the two time series
;
    x(|,jm|) = ( x(|,jm |) + x(|,im|) )/ 2 ;
    xmin = x(|><,jm|) ; xmax = x(|<>,jm|)  ;
    x(|,jm|) = ( x(|,jm|) - xmin ) / ( xmax - xmin ) ;
*
   Recompute the density of the two residual sets
;
    r(|,jm|) = ( r(|,jm |) + r(|,im|) )/ 2 ;

    tr = r(|,jm|)`;
    run kernel(tr,tf) ;
    f(|,jm|) = tf`;
    xxs =  (n||x[,jm]`) ; setout xxs ; append from xxs ;
    ffs =  (n||tf ) ;  setout ffs ; append from ffs ;
    free tr tf xxs ffs ;
  */
RETURN;

CALCDISt: *---- Compute the rho star measure --------------------;
     obmin = min(x) ;
                     obmax = max(x) ;
    obrange = obmax - obmin ;
    cmax = obmax + abs(rmax) ;
    cmin = obmin - abs(rmin) ;
    cn = abs(int((cmax-cmin)/rinc)) + 2 ;
       d[i1,j1] = 0 ;
       do ob = 1 to k ; * cycle over the observations;
       f1 = j(cn,1,0) ;
       f2 = f1 ;
       xv = x[ob,i1] - abs(rmin);
      
       loc1 = int ( abs (xv - cmin) / rinc)  + 1 ;
       id1 = loc1 : (loc1 + rn -1) ;
       f1(|id1,|) = f(|,i1|) ;
       xv = x[ob,j1]  - abs(rmin);
      
       loc1 = int ( abs  (xv - cmin) / rinc)  + 1 ;
       id1 = loc1 : (loc1 + rn -1) ;
       f2(|id1,|) = f(|,j1|) ;
       d(|i1,j1|) = ( 1 - sum(sqrt(f1#f2))# rinc ) + d(|i1,j1|) ; ******* Information based method ;
     *  d(|i1,j1|) = ( sum(abs(f1-f2))# rinc )  + d(|i1,j1|) ;   ******* l1 norm method ****** ;
     *  d(|i1,j1|) = ( sqrt(sum((f1-f2)##2)# rinc ))  + d(|i1,j1|) ;   ******* l2 norm method ****** ;
       end;
    free id1 f1 f2 ;
RETURN;
CALCDISx: *---- Compute the rho star measure --- where the observations don't move-----------------;
*    Only the shape of the density estimate matters in the construction of dx ;

     obmin = min(x) ;
                     obmax = max(x) ;
    obrange = obmax - obmin ;
    cmax = obmax + abs(rmax) ;
    cmin = obmin - abs(rmin) ;
    cn = abs(int((cmax-cmin)/rinc)) + 2 ;
       dx[i1,j1] = 0 ;
       do ob = 1 to 1 ; * cycle over the observations use only one observation;
       f1 = j(cn,1,0) ;
       f2 = f1 ;
    *  xv = x[ob,i1] - abs(rmin);
       xv = .5 - abs(rmin);        ***********************to not use variation******;
       loc1 = int ( abs (xv - cmin) / rinc)  + 1 ;
       id1 = loc1 : (loc1 + rn -1) ;
       f1(|id1,|) = f(|,i1|) ;
     * xv = x[ob,j1]  - abs(rmin);
      xv = .5  - abs(rmin);       ***********************to not use variation******;
       loc1 = int ( abs  (xv - cmin) / rinc)  + 1 ;
       id1 = loc1 : (loc1 + rn -1) ;
       f2(|id1,|) = f(|,j1|) ;
       dx(|i1,j1|) = ( 1 - sum(sqrt(f1#f2))# rinc ) + dx(|i1,j1|) ;
       end;
    free id1 f1 f2 ;
RETURN;
CALCDISn: *---- Compute the Normal rho star measure ---
*
        where the series are assumed to be normal and the variances are assumed to be
         equal to one.
;
*    ;
       dn[i1,j1] = 0 ;
       do ob = 1 to k ; * cycle over the observations;
       mx1 = x[ob,i1] ;
       mx2 = x[ob,j1] ;
       dn(|i1,j1|) = ( 1 - exp( ( (mx1-mx2)##2 ) / -8 )) + dn(|i1,j1|) ;
       end;
RETURN;



CALCDIS2: *----linkage version ---------------------------------;
  iset = dmem[i1,loc(dmem[i1,]) ] ; tdmemi1 = dmem[,i1] ;
  jset = dmem[j1,loc(dmem[j1,]) ] ; tdmemj1 = dmem[,j1] ;

  nis = ncol(iset) ;
  njs = ncol(jset);

        do ri = 1 to nis ;
           rn = iset[,ri] ;
             do ci = 1 to njs ;
             cn = jset[,ci] ;
               if cn > rn then d_t = d_t // t_d[cn,rn] ;
                          else d_t = d_t // t_d[rn,cn] ;
        e2a: end;
        end;
  *  D[I1,J1] = max(d_t) ;  * total Linkage is Max distance ;
  *  D[I1,J1] = min(d_t) ;  * single Linkage is Min Distance;
     D[I1,J1] = d_t[:,]  ;  * Average Linkage is Mean Distance;

   free d_t;
RETURN;


WRITEID: *---------------------------------------------------;
  IDT=ID`;
  PRINT "NUMBER OF CLUSTERS LEFT" N
        IM (|ROWNAME="COMBINED" format=nnc. |)
        JM (|ROWNAME="WITH" format=nnc.|)
        mind (|ROWNAME="DISTANCE"|) ;
  PRINT IDT (|ROWNAME="CLUSTERS"|);
  select = n || mind || k;
  if n = nin-1 then create select from select ;
               else setout select ;
  append from select ;
  sel = sel // SELECT(|,1:3|)  ;
RETURN;

REDOID: *---------------------------------------------------;

  IF IDM = N+1 THEN ID = ID(|1:N,|)//J(NIN-N,1,0) ;
    ELSE ID = ID(|1:IDM-1,|)//ID(|IDM+1:N+1,|)//J(NIN-N,1,0) ;
  TOTIN = CLID(|JDM,2|) + CLID(|IDM,2|) ;
  PARENT = 2*NIN-N ;
  NDAT(|PARENT, |) = TOTIN ;
  CLUSTDAT(|CLID(|JDM,1|),2|) = PARENT ;
  CLUSTDAT(|CLID(|IDM,1|),2|) = PARENT ;
  CLID(|JDM, |) = PARENT||TOTIN ;
  IF IDM = N+1 THEN CLID = CLID(|1:N,|)//J(NIN-N,2,0) ;
    ELSE CLID = CLID(|1:IDM-1,|)//CLID(|IDM+1:N+1,|)//J(NIN-N,2,0) ;
RETURN;

REDIST: *---------------------------------------------------;

  DO II = 1 TO N+1 ;
    I1A = ID(|II,1|) ;
    IF I1A > 0 THEN DO ;
      J1A = JM ;
      IF I1A > J1A THEN DO ;  * NEW ROW ELEMENT ;
        I1 = I1A ;
        J1 = J1A ;
        LINK CALCDIS2;
      END;
      IF I1A < J1A THEN DO ;  * NEW COLUMN ELEMENT ;
        I1 = J1A ;
        J1 = I1A ;
        LINK CALCDIS2 ;
      END;
    END;
  END;
RETURN;

OUTREE: *---------------------------------------------------;

  VARNAMES = {CLNUM PARNUM} ;
  PRINT CLUSTDAT  NDAT ;
  TDAT = CHAR(CLUSTDAT,4,4) ;
  CREATE TREEDAT FROM TDAT (|COLNAME=VARNAMES|) ;
  APPEND FROM TDAT ;
  CREATE NUMDAT FROM NDAT (|COLNAME={NUMIN}|) ;
  APPEND FROM NDAT ;
  vnames1 = { _NCL_ _DIST_ _PDIST_ } ;
  SEL = (J(NIN,1,NIN) || J(NIN,2,0) ) // SEL ;
  CREATE SEL FROM SEL (|COLNAME=VNAMES1|);
  APPEND FROM SEL ;

RETURN;

FINISH;

RUN MAIN;

*-----------------------------------------------------------------;
data dist ; set distance ; length rowtype_ $ 8 varname_ $ 8 ;
  rowtype_ = 'PROX' ;varname_ = obnames ;
keep rowtype_ varname_ ; run; 
data dist ; merge dist distance ; drop obnames ;
label
  l1 = 'Per capita GNP'
   l2 = '1 / Infant Mort'
   l3 = 'Male life expect'
   l4 = 'Female life expect'
   l5 = 'Employment rate'
   l6 = 'Per cap Disp Inc'
   l7 = 'Phys / 100,000'
   l8 = 'Hyw miles per cap'
   l9 = '% HH with phones'
  l10 = '% HH with radio'
  l11 = '100,000 / Homicide'
  l12 = '% age 5-17 in sch'
  l13 = 'Newspap/ cap'
  l14 = 'GNP rate of growth'
  l15 = '% GNP not Defense';
proc print data=dist ; run;
libname outxp xport ('d:\timeclus\distance.xp') ;
data select ; set select ;
  diff = col2 - lag(col2) ;
  ratio = (lag(col2) - lag2(col2)) / diff;
  label col1 = 'No of clusters'
        col2 = 'Dist to combine'
        ratio = 'Change in angle'
        diff = 'Change in dist' ;
proc copy in=work out=outxp ; select dist select ; run;
proc print data=select label split=' ';
proc univariate plot data=select ; var col2 diff ratio;

goptions htitle = 1. ftext=swiss ;
axis1  width = 4 label = ( height=1. 'Number of Clusters') Value=(height=.8)
      major= (width=4) minor= none ;
axis2 value=(height=.8 ) label = ( height=1. ) width=4 major=(width=4) minor=(width=2);
symbol1 value=star i=join width=10 ;
proc gplot data=select; plot (diff col2 ratio  ) * col1 / haxis=axis1 vaxis=axis2 ; run;
proc plot data=select; plot (diff col2 ratio  ) * col1  ; run;
DATA FULLDAT ;
  MERGE TREEDAT NUMDAT ;
  IF INPUT(PARNUM,4.) =0 THEN PARNUM = '    ';
  format clnum  $nncc.  ;
  clnum1 = input(clnum,4.)  ;
  parnum1 = input(parnum,4.) ;
  format clnum1  nnca. ;
* CLNUM = PUT(NCLNUM,4.) ;
* PARNUM = PUT(NPARNUM,4.) ;
 DATA FULLDAT ;
  MERGE FULLDAT SEL ;
 proc contents ;
 proc print ;
axis1 order=(0 to 100 by 10) width = 4 label = ( height=1. 'Distance') Value=(height=.8)
      major= (width=4) minor= (width=2) ;
axis1  width = 4 label = ( height=1. 'Distance') Value=(height=.8)
      major= (width=4) minor= (width=2) ;
axis2 value=(height=.8 ) label = ( height=1. 'Series') width=2;
PROC TREE graphics DATA=FULLDAT SORT HEIGHT=H HOR haxis=axis1 lines = (width=20) vaxis=axis2
            ;
 HEIGHT _DIST_ ;
  NAME CLNUM1 ; id clnum1 ;
  PARENT  PARNUM1 ;
  FREQ  NUMIN ; run;
axis3  width = 4 label = ( height=1. 'Number of Clusters') Value=(height=.8)
      major= (width=4) minor= none ;
PROC TREE graphics DATA=FULLDAT SORT HEIGHT = N HOR  haxis=axis3 lines = (width=20) vaxis=axis2;
  NAME CLNUM1 ; id clnum1 ;
  PARENT  PARNUM1 ;
  FREQ  NUMIN ; run;
PROC TREE DATA=FULLDAT SORT HEIGHT=H HOR ;
 HEIGHT _DIST_ ;
  NAME CLNUM1 ; id clnum1 ;
  PARENT  PARNUM1 ;
  FREQ  NUMIN ; run;
PROC TREE DATA=FULLDAT SORT HOR ;
  NAME CLNUM1 ; id clnum1 ;
  PARENT  PARNUM1 ;
  FREQ  NUMIN ;   run;
proc print ;
 PROC TREE DATA=fulldat SORT noprint
 out=part nclusters=4 ; id clnum1 ;
 name clnum1 ; parent parnum1 ; freq numin ;
 proc sort ; by cluster ;
 proc print ; by cluster ; var clnum1 ; run;

