function load_data_5industries_monthly_FINAL(ind_no)
% LOAD_DATA_5INDUSTRIES_MONTHLY_FINAL  Build the monthly modeling data set
% for one of the Fama-French 5 industry portfolios.
%
% PURPOSE: load the monthly value-weighted returns for the 5 industry
% portfolios from Fama French, merge them with industry-level valuation
% ratios (CRSP/Compustat) and with the Goyal-Welch predictor data, and
% compile the excess return and a list of 13 predictors for industry
% # ind_no.
% --------------------------------------------------------------------------
% USAGE: load_data_5industries_monthly_FINAL(ind_no)
%
% INPUT:
%   ind_no - scalar integer index of the industry portfolio, between 1 and
%            the number of return columns in '5_Industry_Portfolios.csv'
%
% OUTPUT: none returned. The function
%   - displays the dates for which the EP ratio was filled in,
%   - displays a table of estimation/forecast dates per predictor,
%   - writes 'Data for modeling (industry <ind_no> - monthly).csv' into the
%     'Temp' sub-folder of the current directory (created if missing).
%
% FILES READ (from the current folder / MATLAB path):
%   5_Industry_Portfolios.csv, Industry_Data_for_Matlab.csv,
%   PredictorData2010.csv, 5_Industry_Portfolios_Daily.csv
% -------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% First load the monthly Fama French data, value weighted returns 
%DATA = xlsread('5_Industry_Portfolios.xlsx','VW_ret');
DATA = csvread('5_Industry_Portfolios.csv',1,0);
% First column is a yyyymm stamp; split it into year and month
year=floor(DATA(:,1)/100);
monthpart=DATA(:,1)/100-floor(DATA(:,1)/100);
month=floor(monthpart*100+.00001);   % small offset guards against round-off
dates_FF=datenum(year,month,ones(length(year),1));
ind_ret = DATA(:,2:end)/100;         % returns are divided by 100 (percent -> decimal)
clear('DATA','year','monthpart','month');

% Fail early, with a clear message, on an invalid industry index
if ~isscalar(ind_no) || double(ind_no) ~= floor(double(ind_no)) || ...
        double(ind_no) < 1 || double(ind_no) > size(ind_ret,2)
    error('ind_no must be a scalar integer between 1 and %d',size(ind_ret,2));
end

%% Next load the monthly DP, EP, and BM ratios computed for each industry using Compustat and CRSP individual firm data
[DATA,textdata] = xlsread('Industry_Data_for_Matlab.csv');
DATA(DATA==-999.99) = NaN;           % -999.99 is the missing-value code
if size(DATA,2) ~= size(textdata,2)
    error('DATA and textdata should have the same number of columns');
end

year=DATA(:,1);
month=DATA(:,2);
dates_CRSP=datenum(year,month,ones(length(year),1));

% Line up data from CRSP/Compustat with FF data
% NOTE(review): the code below pads ind_ret at the top when the CRSP file
% starts earlier than the FF file, and the subsequent column assignments
% require ind_ret to end up with the same number of rows as DATA. It looks
% like this relies on the two files ending in the same month - confirm
% against the actual input files.
if min(dates_CRSP) < min(dates_FF)
    indx_FF_start = find(dates_CRSP==dates_FF(1));
    if isempty(indx_FF_start)
        error('First Fama-French date not found in Industry_Data_for_Matlab.csv');
    end
else
    indx_FF_start = 1;
end
if max(dates_CRSP) > max(dates_FF)
    indx_FF_end   = find(dates_CRSP==dates_FF(end));
    if isempty(indx_FF_end)
        error('Last Fama-French date not found in Industry_Data_for_Matlab.csv');
    end
else
    indx_FF_end = size(ind_ret,1);
end
ind_ret    = [NaN(indx_FF_start-1,size(ind_ret,2));ind_ret(1:indx_FF_end,:)];

% Rearrange industry predictors from CRSP/Compustat file into matrices
% (one column per industry, located through the header names in textdata)
DP_industry = NaN(size(ind_ret,1),size(ind_ret,2));
DY_industry = NaN(size(ind_ret,1),size(ind_ret,2));
EP_industry = NaN(size(ind_ret,1),size(ind_ret,2));
BM_industry = NaN(size(ind_ret,1),size(ind_ret,2));
NE_industry = NaN(size(ind_ret,1),size(ind_ret,2));

for i=1:size(ind_ret,2)
    indx_DP = strcmp(textdata,['DP',num2str(i)]);
    DP_industry(:,i) = DATA(:,indx_DP==1);
    indx_DY = strcmp(textdata,['DY',num2str(i)]);
    DY_industry(:,i) = DATA(:,indx_DY==1);
    indx_EP = strcmp(textdata,['EP',num2str(i)]);
    EP_industry(:,i) = DATA(:,indx_EP==1);
    indx_BM = strcmp(textdata,['BM',num2str(i)]);
    BM_industry(:,i) = DATA(:,indx_BM==1);
    indx_NE = strcmp(textdata,['Net_expansion',num2str(i)]);
    NE_industry(:,i) = DATA(:,indx_NE==1);
end

%% Next load in Goyal Welch data
[DATA]=csvread('PredictorData2010.csv',1,0);
DATA(DATA==-999.99) = NaN;

% Set up dates (first column is a yyyymm stamp)
year=floor(DATA(:,1)/100);
monthpart=DATA(:,1)/100-floor(DATA(:,1)/100);
month=floor(monthpart*100+.00001);
dates=datenum(year,month,ones(length(year),1));

% Assign everything else
% NOTE(review): column positions are hard-coded; presumably they match the
% layout of PredictorData2010.csv - verify if the input file changes.
Index=DATA(:,2);
Tbill=DATA(:,6);
AAA=DATA(:,7);
BAA=DATA(:,8);
LTY=DATA(:,9);
Rfree=DATA(:,11);
Infl=DATA(:,12);
LTR=DATA(:,13);
Corpr=DATA(:,14);

% Line up data from FF and Compustat/CRSP with Goyal and Welch data:
% pad with NaN rows before the first CRSP date and cut at the last
% Goyal-Welch date
indx_FF_start = find(dates==dates_CRSP(1));
indx_FF_end   = find(dates_CRSP==dates(end));
if isempty(indx_FF_start)
    error('First date of Industry_Data_for_Matlab.csv not found in PredictorData2010.csv');
end
if isempty(indx_FF_end)
    error('Last date of PredictorData2010.csv not found in Industry_Data_for_Matlab.csv');
end
ind_ret       = [NaN(indx_FF_start-1,size(ind_ret,2));ind_ret(1:indx_FF_end,:)];
DP_industry   = [NaN(indx_FF_start-1,size(ind_ret,2));DP_industry(1:indx_FF_end,:)];
DY_industry   = [NaN(indx_FF_start-1,size(ind_ret,2));DY_industry(1:indx_FF_end,:)];
EP_industry   = [NaN(indx_FF_start-1,size(ind_ret,2));EP_industry(1:indx_FF_end,:)];
BM_industry   = [NaN(indx_FF_start-1,size(ind_ret,2));BM_industry(1:indx_FF_end,:)];
NE_industry   = [NaN(indx_FF_start-1,size(ind_ret,2));NE_industry(1:indx_FF_end,:)];

%% Risk free rate (continuously compounded)
rf = log(1+Rfree);

%% LHS
% Continuously compounded excess return for the selected industry:
% log return at t minus the log risk-free rate observed at t-1

long_sample_er = [NaN;log(1+ind_ret(2:end,ind_no))-log(1+Rfree(1:end-1))];

%% Predictors start here
predictormat=NaN(length(Index),13);
Predictor_names = cell(13,1);
Predictor_names_short = cell(13,1);

% (1) Dividend yield
predictormat(:,1)=log(DY_industry(:,ind_no));
Predictor_names(1) = {'Log dividend yield'};
Predictor_names_short(1) = {'Log(DY)'};

% (2) Earning price ratio
tmp_ep = NaN(size(EP_industry(:,ind_no)));
tmp_ep(EP_industry(:,ind_no) > 0) = log(EP_industry(EP_industry(:,ind_no) > 0,ind_no));
indx_fillin = find(EP_industry(:,ind_no) <= 0);
% Fix few isolated instances where the EP ratio for some industries became
% non-positive, replacing ln(EP_{t}) with ln(EP_{t-1}). The loop runs in
% ascending order so consecutive instances carry the last valid value forward.
for ii=1:length(indx_fillin)
    % There is no previous observation for the very first row; leave it NaN
    % instead of indexing element 0
    if indx_fillin(ii) > 1
        tmp_ep(indx_fillin(ii)) = tmp_ep(indx_fillin(ii)-1);
    end
end
disp('Earning price ratios filled in for the following dates:');
disp(datestr(dates(indx_fillin)));

predictormat(:,2) = tmp_ep;
Predictor_names(2) = {'Log earning price ratio'};
Predictor_names_short(2) = {'Log(EP)'};

% (3) log div payout ratio: log(DP) - log(EP)
predictormat(:,3)=log(DP_industry(:,ind_no)) - predictormat(:,2);
Predictor_names(3) = {'Log dividend-payout ratio'};
Predictor_names_short(3) = {'Log(DE)'};

% (4) Book-to-Market
predictormat(:,4)=BM_industry(:,ind_no);
Predictor_names(4) = {'Book-to-market ratio'};
Predictor_names_short(4) = {'BM'};

% (5) T bill rate
predictormat(:,5)=Tbill;
Predictor_names(5) = {'T-Bill rate'};
Predictor_names_short(5) = {'TBL'};

% (6) long term yield
predictormat(:,6)=LTY;
Predictor_names(6) = {'Long-term yield'};
Predictor_names_short(6) = {'LTY'};

% (7) long term return
predictormat(:,7)=LTR;
Predictor_names(7) = {'Long-term return'};
Predictor_names_short(7) = {'LTR'};

% (8) term spread
predictormat(:,8)=LTY - Tbill;
Predictor_names(8) = {'Term spread'};
Predictor_names_short(8) = {'TMS'};

% (9) default yield spread
predictormat(:,9)=BAA - AAA;
Predictor_names(9) = {'Default yield spread'};
Predictor_names_short(9) = {'DFY'};

% (10) default return spread
predictormat(:,10)=Corpr - LTR;
Predictor_names(10) = {'Default return spread'};
Predictor_names_short(10) = {'DFR'};

% (11) stock variance 
% Load the daily Fama French data, value weighted returns 
[DATA_d]=csvread('5_Industry_Portfolios_Daily.csv',1,0);
DATA_d(DATA_d==-999.99) = NaN;

% First column is a yyyymmdd stamp; only year and month are needed
year_d=floor(DATA_d(:,1)/10000);
month_d=floor(100*(DATA_d(:,1)/10000-floor(DATA_d(:,1)/10000)));

ind_ret_d = DATA_d(:,2:end)/100;

Svar_new = NaN(size(ind_ret));

% Monthly variance proxy: sum of squared daily returns within each month
for t=indx_FF_start:length(dates)
    [y,m] = datevec(dates(t));
    this_indx = find(year_d==y & month_d == m);
    if ~isempty(this_indx)
        % sum(...,1) keeps the result one value per industry even when a
        % month contains a single daily observation
        Svar_new(t,:) = sum(ind_ret_d(this_indx,:).^2,1);
    else
        warning(['No data for ',datestr(dates(t))]);
    end
end


predictormat(:,11)=Svar_new(:,ind_no);
Predictor_names(11) = {'Stock variance'};
Predictor_names_short(11) = {'SVAR'};

% (12) inflation
% fix outlier: 1946.6, replace with average of 1946.5 & 1946.7
% NOTE(review): the row number 907 is hard-coded - confirm it still points
% at the intended month if PredictorData2010.csv is replaced.
Infl(907)=(Infl(906)+Infl(908))/2;
% Note: need to fix inflation series since inflation rate data are released
% in the following month
Infl = [NaN;Infl(1:end-1)];

predictormat(:,12)=Infl;
Predictor_names(12) = {'Inflation'};
Predictor_names_short(12) = {'INFL'};

% (13) net equity expansion
predictormat(:,13)=NE_industry(:,ind_no);
Predictor_names(13) = {'Net equity expansion'};
Predictor_names_short(13) = {'NTIS'};



%% Drop all observations dated before 1926 (restricting to CRSP period only)
dates(year<1926)              = [];
month(year<1926)              = [];
long_sample_er(year<1926,:)   = [];
predictormat(year<1926,:)     = [];
rf(year<1926)                 = [];

% year is trimmed last because it is the mask used above
year(year<1926)               = [];

%% Set up estimation and forecast periods
tb   = NaN*ones(size(predictormat,2),1);
tb_f = NaN*ones(size(predictormat,2),1);
te   = NaN*ones(size(predictormat,2),1);


out_cell = cell(length(Predictor_names)+1,4);
out_cell(1,:) = [{'Predictor'};{'Estimation start date'};{'Forecast start date'};{'Forecast end date'}];

for i=1:size(predictormat,2)
    % Rows where both the excess return and the lagged predictor are available
    this_nan = sum(isnan([long_sample_er, [NaN;predictormat(1:end-1,i)]]),2);
    tmp = find(this_nan == 0);
    tb(i,1) = max([min(tmp);find(dates==datenum('2-1-1927'))]);
    tb_f(i,1) = tb(i)+239;   % forecasts start 240 observations into the sample
    te(i,1) = max(tmp);
    
    out_cell(i+1,1) = Predictor_names(i);
    out_cell(i+1,2) = cellstr(datestr(dates(tb(i)),'mmmm, yyyy'));
    out_cell(i+1,3) = cellstr(datestr(dates(tb_f(i)),'mmmm, yyyy'));
    out_cell(i+1,4) = cellstr(datestr(dates(te(i)),'mmmm, yyyy'));
end

disp(['Industry ',num2str(ind_no)]);
disp(' ');
disp(out_cell);
disp(' ');


% Set all estimation starts to be the same across all predictors and
% industries (to work with the longest available sample common to all series)
tb   = repmat(max(tb),size(tb,1),1);
tb_f = repmat(max(tb_f),size(tb_f,1),1);
te   = repmat(min(te),size(te,1),1);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Hardcode est start to Jan 1964 and forecast start to Jan 1979
% NOTE(review): tb, tb_f and te are not written to the output file below
% and the function returns nothing, so these values are currently unused.
tb   = repmat(find(dates==datenum('1-1-1964')),size(tb,1),1);
tb_f = repmat(find(dates==datenum('1-1-1979')),size(tb_f,1),1);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Save data and print to excel
% Build paths with fullfile so the separator matches the running platform
out_dir = fullfile(pwd,'Temp');
if ~exist(out_dir,'dir')
    mkdir(out_dir);
end
out_file = fullfile(out_dir,['Data for modeling (industry ',num2str(ind_no),' - monthly).csv']);

% Outexcel data for modeling
out_cell2 = cell(size(predictormat,1)+1,size(predictormat,2)+2);
out_cell2(1,:) = [{'Dates'},{'Excess return (CRSP)'},Predictor_names'];
% Dates are stored as day counts relative to 30-Dec-1899 (Excel serial dates)
out_cell2(2:end,1) = num2cell(datenum(year,month,1)- datenum('30-Dec-1899'));
out_cell2(2:end,2:end) = num2cell([long_sample_er predictormat]);

% Write the header row first, then append the numeric block
fid = fopen(out_file, 'w') ;
if fid < 0
    error('Could not open output file for writing: %s',out_file);
end
fprintf(fid, '%s,', out_cell2{1,1:end-1}) ;
fprintf(fid, '%s\n', out_cell2{1,end}) ;
fclose(fid) ;
% NOTE(review): dlmwrite is called without a 'precision' argument, so the
% numbers are written at dlmwrite's default precision - confirm that this
% is sufficient for the downstream modeling code.
dlmwrite(out_file, cell2mat(out_cell2(2:end,1:end)), '-append') ;