options linesize=80;
options pagesize=60;


/*********   Question 2  *************/


filename costraw "./costs.txt";

data costs;
  infile costraw;
  input id costs days visits year quarter;

  if costs EQ 0 then delete;
  if days EQ 0 then delete;
  if visits EQ 0 then delete;

  /* let's measure costs in millions */
  costs = costs/1000000;

  lnC = log(costs);
  lnD = log(days);
  lnV = log(visits);

  time = (year-1991)*4 + quarter - 1;


proc reg data = costs;
  model costs = days visits time;
  output out=linout pred=C_hat;
  title "linear specification";

/* In the following, outest will contain the
   coefficients from the regression */

proc reg data=costs outest=lncoef;
  model lnC = lnD lnV time;
  output out=lnout pred=lnC_hat residual=e;
  title "log specification";

proc print data=lncoef;
title "coefficients from log specification";

data lncoef;
  set lncoef;

  b_lnD = lnD;
  b_lnV = lnV;
  b_t   = time;
  b_1   = intercept;
  keep b_lnD b_lnV b_t b_1;

/* For problem 2a)

   Let's calculate marginal costs for the lnC
   specification.

   Costs are:

   C = exp(beta1 + beta2*lnD + beta3*lnV + beta4*time + epsilon)

   So, the marginal cost of a day & a visit is:

   dC/dD = C * beta2 / D;

   dC/dV = C * beta3 / V;

   Since these involve error terms, we should take expectations:

   E{dC/dD} = E{C} * beta2 / D;

   E{dC/dV} = E{C} * beta3 / V;

   and our best estimate of E{C} is:

   E{C}-hat = exp(beta1 + beta2*lnD + beta3*lnV + beta4*time)*smear

   where smear is the smearing estimator described in class.

*/

data lnout;
  set lnout;

  smear = exp(e);

  label lnC_hat = " ";

proc means mean data=lnout;
  var smear;
  output out=smearmean mean=smearing;
  title "smearing factor from log specification";

data lnout;
  if _N_ EQ 1 then set smearmean(keep=smearing);
  if _N_ EQ 1 then set lncoef;
  set lnout;

  dCdD = smearing*exp(lnC_hat)*b_lnD/days;
  dCdV = smearing*exp(lnC_hat)*b_lnV/days;

proc means data=lnout;
title "marginal costs from log specification";


/* For 2b), let's do the Ramsey RESET test with 2 terms: */

data linout;
  set linout;

  Ramsey2 = C_hat*C_hat;
  Ramsey3 = C_hat*Ramsey2;

proc reg data=linout;
  model costs = days visits time Ramsey2 Ramsey3;
  title "The Ramsey RESET test regression";

/* For 2d), the SSE from the linear model is the sum
   of squared errors for that model, we just need the
   SSE from the log model.  To find this we need to
   subtract costs-costshat, NOT lncosts-lncostshat!! */

data lnout;
  set lnout;

  Chat = exp(lnC_hat)*smearing;
  error = costs-Chat;
  error2 = error*error;

proc print noobs data=lnout(obs=10);
  var costs Chat lnC_hat smearing error;

proc means data=lnout N mean sum;
  var error2;
  title "SSE for costs in log model";

proc means data=linout;
  var C_hat;


/***************** Question 3 ******************/


filename mepsraw 'MEPSextract1996.txt';

data h_spending;
  infile mepsraw;
  input age sex income employed insured health spending;

  hspoor = health EQ 5;
  hsfair = health EQ 4;
  hsgood = health EQ 3;
  hsvryg = health EQ 2;
  hsexcl = health EQ 1;

  i_poor = insured EQ 1 and health EQ 5;
  i_fair = insured EQ 1 and health EQ 4;
  i_good = insured EQ 1 and health EQ 3;
  i_vryg = insured EQ 1 and health EQ 2;
  i_excl = insured EQ 1 and health EQ 1;


proc reg;
  model spending = income age sex employed insured
        hspoor hsfair hsgood hsvryg / spec ACOV;
  test hsvryg=0;
  title "MEPS spending regression";

proc reg;
  model spending = income age sex employed insured
        hspoor hsfair hsgood hsvryg 
        i_poor i_fair i_good i_vryg / spec ACOV;
  test i_poor=0, i_fair=0, i_good=0, i_vryg=0;


/***************   Question 4  ****************/

data greene248;
  input Y X1 X2;
  cards;
 -1.42  -1.65  -0.67
 -0.26  -0.63  -0.74
 -0.62  -1.78   0.61 
 -1.26  -0.80   1.77
  5.51   0.02   1.87
 -0.35  -0.18   2.01
  2.75   1.48   0.70
 -4.87   0.34  -1.87
  7.01   1.25   2.32
 -0.15  -1.32   2.92
-15.22   0.33  -3.45
 -0.48  -1.62   1.26
  2.10   0.77   0.32
  5.94   0.35   1.56
 26.14   0.22   4.38
  3.41   0.16  -1.94
 -1.47  -1.99  -0.88
  1.24   0.39  -2.02
 -5.08   0.67   2.88
  2.21   0.79   0.37
  7.39   1.25   2.16
 -5.45   1.06   2.09
 -1.48   0.70  -1.53
  0.69   0.17   1.91
  1.49   0.68  -0.19
 -6.87   0.77  -2.07
  0.79  -0.12   1.51
  1.31  -0.60   1.50
  6.66  -0.17   1.42
  1.91   1.02  -2.23
  1.00   0.23  -1.28
  0.90  -1.04   1.20
  1.93   0.66   0.30
  1.52   0.79  -0.46
  1.78   0.33  -2.70
  0.16  -0.40  -2.72
  1.61   0.28   0.26
  1.97   1.06  -0.17
  2.04   0.86   0.19
  2.62   0.48   1.77
 -1.11  -1.13  -0.70
  2.11   0.58  -1.34
-23.17  -0.66   7.82
  3.00   2.04  -0.39
 -5.16   1.90  -1.89
  1.66   0.15  -1.55
 -3.82  -0.41  -2.10
 -2.52  -1.18  -1.15
  6.31  -0.51   1.54
 -4.71  -0.18  -1.85
;

proc means;

proc reg outest=regcoef;
  model Y = X1 X2 / COVB;
  output out=regout residual=e;
  title "Greene 248, problem 5a";

proc reg;
  model Y = X1 X2 / ACOV;
  title "Greene 248, problem 5b";

proc reg;
  model Y = X1 X2 / spec;
  title "Greene 248, problem 5c";

data regout;
  if _N_ EQ 1 then set regcoef(keep=_RMSE_);
  set regout;

  BPaganLHS = e/_RMSE_;
  BPaganLHS = BPaganLHS*BPaganLHS;

proc reg;
  model BPaganLHS = X1 X2;
  title "Greene 248, problem 5d";


/* Let's sort the data by X1 and do G-Q dropping the
   10 middle observations (you could drop a different
   number) */

proc sort data=Greene248;
  by X1;

proc reg data=Greene248(obs=20);
  model Y = X1 X2;
  title "Greene 248, problem 5e - X1 G-Q test";

proc reg data=Greene248(firstobs=31 obs=50);
  model Y = X1 X2;

/* Now for X2 */

proc sort data=Greene248;
  by X2;

proc reg data=Greene248(obs=20);
  model Y = X1 X2;
  title "Greene 248, problem 5e - X2 G-Q test";

proc reg data=Greene248(firstobs=31 obs=50);
  model Y = X1 X2;


/* Since the previous test seemed to show that X2 was
   the X variable which "causes" the heteroskedasticity,
   let's model:

   sigma-squaredi = sigma-squared*(X2i-to-the-alpha)

   or 

   ln(sig-sqi) = ln(sigma-squared) + alpha*ln(X2)

   We can't do this b/c X2 is sometimes negative, so we 
   will:

   ln(sig-sqi) = ln(sigma-squared) + alpha*X2


*/

proc reg data=Greene248;
  model Y = X1 X2;
  output out=olsout residual=e;
  title "Greene 248, problem 6";

data olsout;
  set olsout;

  e2 = e*e;
  lne2 = log(e2);

proc reg data=olsout;
  model lne2 = X2;
  output out=hetero_pred predicted=lne2hat;

data hetero_pred;
  set hetero_pred;

  e2hat = exp(lne2hat);

  Y_weight = Y/sqrt(e2hat);
  X1_weight = X1/sqrt(e2hat);
  X2_weight = X2/sqrt(e2hat);
  const_weight = 1/sqrt(e2hat);

proc reg;
  model Y_weight = const_weight X1_weight X2_weight / noint;