/************************************************************************************
 * program: char2fmt.mac
 * date: 20Aug2005
 * moddate: 06Oct2005
 * programmer: Dan Blanchette
 * comments:  macro to encode character variables to numeric using
 *             user-defined formats to store the character string.
 *           - the format name will be the variable name so any format already
 *              existing with the same name will be over written.
 *           - variables with names that end in numbers are not valid format
 *              names so these formats will be changed to end with an underscore
 *           - sort order of data set will be maintained thus if data set is known
 *              to be sorted by one or more character variables those variables
 *              will not be converted.
 *           - there is no option for specifying a varlist.
 *              
 *
 *  WARNING: This changes your data set by changing some/all character variables
 *            to numeric variables.  Make a back-up copy of your data set
 *            before runing CHAR2FMT.     
 *
 ************************************************************************************
 * Options:
 ************************************************************************************
 * dset= enter name of data set to be modified.  Include libref if you want
 *       to modify a non-work data set. 
 *
 * maxlen= set to what maximum length the character variable can be and not
 *          be converted.  Not setting maxlen will convert all character variables.
 *
 * temp_dir= enter directory name you would like have the file _&tfns_longcharvar.sas
 *            created in if you want to see the code generated by CHAR2FMT,
 *            otherwise the work directory will be used and _&tfns_longcharvar.sas
 *            will be deleted when your SAS session is over.
 *
 * tfns= is for a file name of your choice.  If no set, it will be based on
 *        the SAS &SYSJOBID and &SYSINDEX macro variables.
 *
 ************************************************************************************/

%macro char2fmt(dset= , maxlen= ,  temp_dir=  , tfns=  );

 ** Save option settings so they can be restored at the end of this macro. **;
 %let notes=%sysfunc(getoption(notes));

 %let obs=%sysfunc(getoption(obs));

 options obs=MAX;   *** Reason for maximizing it is because user could have
                      *  set it lower than the number of variables in the dataset. **;
 options nonotes;   *** Shut off notes while program is running in order to reduce log size. **;

 %if &sysver >=9 %then %do;
  options noquotelenmax ;;
 %end;


  proc contents data=work.&dset. out=_conten2 noprint;
  run;

 %let toolong=0; 
 %let maxfilewidth=32300; ** as wide as I am willing to make file **;
 %let maxvarlength=32200; ** this needs to be a little smaller than maxfilewidth **; 

 %let hasvars=0;  ** initialize hasvars **;
  data _conten2;
   set _conten2;
   where type=2 
     %if "&maxlen." ^= ""  %then %do;
      and length > &maxlen.
     %end;
    ;;;
   call symput('hasvars',_n_);
  run;
 
 
  ** if hasvars then do rest of program  **;
 %if (&hasvars. > 0) %then %do;
  
  %if "%nrbquote(&temp_dir)"=""  %then %let temp_dir = %sysfunc(pathname(work));
  %if "&tfns"=""  %then %let tfns= &sysjobid.&sysindex.;

  %let csortedby =;
   data _null_;
    dsid=open("work.&dset","i");
    csortedby=lowcase(attrc(dsid,"SORTEDBY"));
    call symput("csortedby",csortedby);
    rc=close(dsid);
   run;

  %if %length(&csortedby.) = 0  %then %do;
    data work.&dset.;
     set work.&dset.;
      ___ob___ = _n_;  ** create var to preserve sort order **;
    run;
    %let csortedby =___ob___;
  %end;


  ** create formats from long character data **;
  data _null_;
   set _conten2 ;
    where lowcase(name) not in(%sortvars(&csortedby.)); 
      ** end of where statement,  do not want to convert vars needed for sort order *;
   file "%nrbquote(&temp_dir.)_&tfns._longcharvars.sas" ls=&maxfilewidth;
   name=left(name);
   if length(name)> &maxvarlength. then do;
     name=substr(name,1,&maxvarlength.);
     call symput('toolong',1);
   end;
   put " ";
   put " proc sort data=&dset;";
   put "  by " name ";";
   put " run;";
   put " ";
   put " data _conten2;";
   put "  set &dset.(keep= " name ");";
   put "  by " name ";";
   put "  if first." name ";";
   put " run;";
   put " ";
   put " data _conten2;";
   put "  set _conten2;";

   invalidf=0;
   %let vallen=32;
   %if &sysver. < 9 %then %let vallen=8;
   ** check that variable name does not end in a number and if so then add "_" to name **;
   %do i=0 %to 9;
    if  substr(compress(name),length(compress(name)),1)="&i." or name in(%invalid_formats) then do;
     invalidf=1;
     if length(name) < &vallen. then /** add an underscore and period **/ 
        name_f = compress(name || "_");  
     else if length(name) = &vallen. or name in(%invalid_formats) then 
        /** overwrite last character with underscore and add period **/
        name_f = compress(substr(compress(name),1,length(compress(name))-1) || "_"); 
    end;
   %end;
   if invalidf=1 then do;
      put "  fmtname=compress(""" name_f """);";
      name_p = compress(name_f || ".");
   end;
   else do; /** if no change required **/
      put "  fmtname=compress(""" name """);";
      name_p = compress(name || ".");
   end;
   
   put "  start=_n_;";
   put "  ___st___=_n_;";
   put "  end=_n_;";
   put "  label=" name ";";  ** SAS versions 8+ have the same max length for vars and formats *;
   put " run;";
   put " ";

   put " proc format library=work cntlin=_conten2(keep=fmtname start end label);";
   put " run;";
   put " ";
   put " data &dset.(drop= " name " rename=(___st___=" name "));";
   put "  merge &dset.  _conten2(keep=" name " ___st___);";
   put "  by " name ";";
   put "  label " name " = """%nrbquote(label)""";";
   put " run;";
   put " ";
   put " data &dset.;";
   put "  set &dset.;";
   

   put "  format " name " " name_p ";";
   put " run;";
   put " ";
  run;

  %include"&temp_dir._&tfns._longcharvars.sas";
 
  ** return data to original sort oder **;
  proc sort data=&dset.;
   by &csortedby.;
  run;
  %if "&csortedby" = "___ob___" %then %do;
   data &dset;
    set &dset (drop=___ob___);
   run;
  %end;

 %end;  ** of if hasvars to process **;

 %if &toolong=1 %then %do;
    %put WARNING: CHAR2FMT truncated at least one character variable because it was longer than &maxvarlength.  *; 
 %end;
 %goto done;

 %fail1:;
    %put ERROR: CHAR2FMT did not run because you have a least one character variable that is longer than &maxvarlength.  *; 
    %let savastata_err=1;
 %goto done;

 %done:;
 proc datasets nodetails nolist nowarn;
  delete _conten2;
 run;

 options obs=&obs. &notes.;  ** Restore options. **;


%mend char2fmt;


%macro invalid_formats;
   "best"     , "binary"   , "comma"    , "commax"   , "d"        , "date"     , "datetime"
   "dateampm" , "day"      , "ddmmyy"   , "dollar"   , "dollarx"  , "downame"  , "e"       
   "eurdfdd"  , "eurdfde"  , "eurdfdn"  , "eurdfdt"  , "eurdfdwn" , "eurdfmn"  , "eurdfmy" 
   "eurdfwdx" , "eurdfwkx" , "float"    , "fract"    , "hex"      , "hhmm"     , "hour"    
   "ib"       , "ibr"      , "ieee"     , "julday"   , "julian"   , "percent"  , "minguo"  
   "mmddyy"   , "mmss"     , "mmyy"     , "monname"  , "month"    , "monyy"    , "negparen"
   "nengo"    , "numx"     , "octal"    , "pd"       , "pdjulg"   , "pdjuli"   , "pib"     
   "pibr"     , "pk"       , "pvalue"   , "qtr"      , "qtrr"     , "rb"       , "roman"   
   "s370ff"   , "s370fib"  , "s370fibu" , "s370fpd"  , "s370fpdu" , "s370fpib" , "s370frb" 
   "s370fzd"  , "s370fzdl" , "s370fzds" , "s370fzdt" , "s370fzdu" , "ssn"      , "time"    
   "timeampm" , "tod"      , "weekdate" , "weekdatx" , "weekday"  , "worddate" , "worddatx"
   "wordf"    , "words"    , "year"     , "yen"      , "yymm"     , "yymmdd"   , "yymon"   
   "yyq"      , "yyqr"     , "z"        , "zd"       , "f"


%mend invalid_formats;
%macro sortvars (varlist) ;
         %let i=1;
         %do %while(%length(%scan(%cmpres(&varlist.),&i,%str( ))) GT 0); 
               %global var&i.;
               %let var&i.= %scan(%cmpres(&varlist.),&i,%str( ));
               %let i=%eval(&i + 1);
         %end;
         %do j=1 %to %eval(&i-1);
                  %sysfunc(compress(%str(%")&&var&j.%str(%")))
         %end;
%mend sortvars ;