program define chunky, sclass
version 9

*! version 1.0.0  2008.04.26
*! version 1.0.1  2009.01.20
*! version 1.2.0  2010.06.10 Changed syntax and processes, better error-handling
*! version 2.0.0  2010.08.21 Major upgrade to use Mata file I/O routines
*! version 2.0.1  2010.08.27 Version submitted to SSC
*!
*! by David C. Elliott
*! Text file chunking algorithm
*!
*! syntax:
*! using filename
*! chunksize() is the size of the chunk to be read
*!   the size can be specified in bytes, kilobytes (k|kb), megabytes (m|mb) or gigabytes (g|gb)
*!   the power of 10 suffix is case-insensitive and can have a space after the number.  Decimals can be used.
*!   e.g. 5000Kb = 5m = .005 GB
*! stub() is file name stub for the chunks to be saved,
*!   defaults to chunk and will create chunk000#.txt...
*! replace allows overwriting of previous chunk files
*! header(include|skip|none) specifies how to handle the first line of the file
*!   useful when the first line contains variable names
*! peek(#) displays first n lines to screen
*! analyze checks using file for problem characters, etc, and provides table of chunk sizes
*! note - this routine works on text files only

// Entry point.  Parses the options, then either
//   - shows the first lines of the file (peek), and/or
//   - reports on the file's composition (analyze), or
//   - splits the file into chunk files via the Mata routine chunkfile().
// When chunking is performed, s(filelist) holds the list of chunk file names
// returned by chunkfile().

syntax using/ [, ///
	Peek(numlist max=1 >0 integer) ///
	Analyze ///
	Chunksize(string) ///
	Stub(string) REPLACE ///
	Header(string)]

tempname sheader sfilelist schunksize sreplace

*set trace on
*set tracedepth 1
local infile `using'
confirm file `"`infile'"'  // fail early with a standard Stata error if the file is missing
scalar `sreplace' = cond("`replace'"=="replace",1,0)

file close _all
/* debugging !del chunk0*.txt*/

if "`peek'" != "" { // peek takes precedence over any other option
	mata: peek(`"`infile'"', `peek')
	}
if "`analyze'" != "" { // analyze takes next precedence
	_analyze `"`infile'"'
	}
if ("`peek'" != "" | "`analyze'" != "") & "`chunksize'"!="" {
	di _n "{err: Warning: peek() and analyse options overide chunking options.}" ///
	_n "{err: Chunking was not performed.}"
	}
if ("`peek'" != "" | "`analyze'" != "") {
	exit
	}

if `"`stub'"'=="" {  // check chunk stubname, default to "chunk" if none
	local stub chunk
	}

// Parse out chunksize
// An omitted chunksize() is handled explicitly: the parsing pattern consists
// only of optional parts, so it cannot be used to detect "nothing specified".
local chunksize = trim(`"`chunksize'"')
if `"`chunksize'"' == "" {
	scalar `schunksize' = 100000000  // Default to 100MB
	}
else if regexm(`"`chunksize'"',"^([0-9]*[.]?[0-9]*)[ ]*([kKmMgG]?)[bB]?$") {
	// group 1 = numeric coefficient, group 2 = optional K/M/G multiplier
	local coef = regexs(1)
	local unit = upper(regexs(2))
	// real() yields missing for "" or ".", which is rejected below
	scalar `schunksize' = real("`coef'") * cond("`unit'" != "", 1000^(strpos("KMG","`unit'")), 1)
	}
else {
	scalar `schunksize' = .
	}
// Reject unparsable, missing, zero or sub-byte sizes before they reach Mata,
// where a zero size would never advance through the input file.
if `schunksize' >= . | `schunksize' < 1 {
	di as err `"chunksize(`chunksize') is not a valid size"'
	di as err "specify a positive number optionally followed by k|kb, m|mb or g|gb, e.g. chunksize(50mb)"
	exit 198
	}

// Determine header processing: 1 = none, 2 = include, 3 = skip
if "`header'"=="" { 	//parse out header command
	scalar `sheader' = 1
	}
else {
	if `: word count `header'' > 1 {
		di "{err:too many header options}" _n ///
		"{err:valid options are: header(include|skip|none)}"
		exit 198
		}
	// let -syntax- do the abbreviation matching of the single keyword
	local 0 ,`header'
	capture syntax [,None Include Skip]
	if _rc {
		di "{err:{res:`=substr("`0'",2,.)' }is not a header option}" _n ///
		"{err:valid options are: header(include|skip|none)}"
		exit 198
		}
	if "`none'" == "none" {
		scalar `sheader' = 1
		}
	else if "`include'" == "include" {
		scalar `sheader' = 2
		}
	else {
		scalar `sheader' = 3
		}
	}

// The Mata compile-time settings (matalnum, matastrict) are deliberately not
// changed here: they would persist in the user's session after chunky exits.
mata: st_local("`sfilelist'",chunkfile(`"`infile'"', `=`schunksize'', `"`stub'"', `=`sheader'',`=`sreplace''))

// copy the macro directly (no "=") so the list is not evaluated as a string
// expression, which may truncate long lists on older Stata releases
sreturn local filelist `"``sfilelist''"'

end

***************
* subroutines *
***************

// _analyze: report on the composition of a text file and tabulate candidate
// chunk sizes.  Takes one argument, the file name.  All figures are derived
// from the r() results left behind by -hexdump, analyze results-; nothing is
// returned to the caller, the output is display-only.
program define _analyze
	args infile
	di _n `"{txt:Analyzing {res:`infile'} for chunking}"' _n
	// hexdump's own output is suppressed; only its r() results are used below
	quietly hexdump `"`infile'"' , analyze results

// set up scalars
tempname s_av_line_len s_max_line_len s_letters s_max_letters s_numbers s_remainder s_pct_char s_pct_num s_pct_other s_mem s_stata_size
local format `r(format)'
local extended 		`=cond(r(extended)>0,"{err:Extended characters are present and may cause problems.}","No extended characters present.")'
// average line length = file size / number of lines, rounded to whole bytes
scalar `s_av_line_len' 	= round(r(filesize)/r(lnum),1)
scalar `s_max_line_len'	= r(lmax)
// letters = upper-case + lower-case counts
scalar `s_letters' 		= r(uc) + r(lc)
// letter count scaled up by the ratio of longest to average line length
scalar `s_max_letters'	= round(`s_letters'*(`s_max_line_len'/`s_av_line_len'),1)
scalar `s_numbers' 		= r(digit)
// everything that is neither a letter nor a digit
scalar `s_remainder' 	= r(filesize) - (`s_letters' + `s_numbers')
// percentages are rounded to whole percent
scalar `s_pct_char'		= round(`s_letters'/r(filesize),.01)*100
scalar `s_pct_num'		= round(`s_numbers'/r(filesize),.01)*100
scalar `s_pct_other'		= round(`s_remainder'/r(filesize),.01)*100
// NOTE(review): s_mem is assigned but not referenced again in this program
scalar `s_mem' 			= c(memory)
// rough size estimate: scaled letter count plus digit count / 1.5
// (the report itself labels this figure "very approximate")
scalar `s_stata_size'	= round(`s_max_letters' + `s_numbers'/1.5,1)

n di "{txt:{res:`format'} is the file type}"
n di "{txt:File has {res:`r(lnum)'} lines of average length {res:`=`s_av_line_len''} bytes}"
n di "{txt:Composition is {res:`=`s_pct_char''%} letters, {res:`=`s_pct_num''%} numbers and {res:`=`s_pct_other''%} other characters}"
n di "{txt:`extended'}"
// The per-character table is shown only when r(extended) is positive.
 if r(extended)>0 {
	// display labels for codes 0-31, one quoted word per code, indexed by code+1
	local codelist "000 \0" "001 ^A" "002 ^B" "003 ^C" "004 ^D" "005 ^E" "006 ^F" "007 ^G" "008 ^H" "009 \t" "010 \n" "011 ^K" "012 ^L" "013 \r" "014 ^N" "015 ^O" "016 ^P" "017 ^Q" "018 ^R" "019 ^S" "020 ^T" "021 ^U" "022 ^V" "023 ^W" "024 ^X" "025 ^Y" "026 ^Z" "027 Es" "028 FS" "029 GS" "030 RS" "031 US"
	n di _n "{txt:Extended characters found:}"
	n di "{txt:{c TLC}{dup 6:{c -}}{c TT}{dup 10:{c -}}{c TRC}}"
	n di "{txt:{c |}{center 6:ASCII}{c |}{center 10:count}{c |}}"
	n di "{txt:{c LT}{dup 6:{c -}}{c +}{dup 10:{c -}}{c RT}}"

	// codes 0-31 with a non-zero count, skipping 10 (\n) and 13 (\r)
	forvalues ascii = 0/31 {
		if `=r(c`ascii')' > 0 & !inlist(`ascii',10,13) {
			di "{txt:{c |}`:word `=`ascii'+1' of "`codelist'"'{c |}{res:{ralign 10:`=r(c`ascii')'}}{c |}}"
			}
		}
	// codes 161-254 with a non-zero count, shown as zero-padded 3-digit numbers
	forvalues ascii =  161/254 {
		if `=r(c`ascii')' > 0 & !inlist(`ascii',10,13) {
			di "{txt:{c |}{lalign 6:`=substr("000`ascii'",-3,3)'}{c |}{res:{ralign 10:`=r(c`ascii')'}}{c |}}"
			}
		}
	n di "{txt:{c BLC}{dup 6:{c -}}{c BT}{dup 10:{c -}}{c BRC}}"
 	}

n di _n"{txt:Approximate chunk sizes and memory requirements {c 10}for -{help insheet:insheet}- or -{help infile:infile}- commands}"

n di "{txt:{c TLC}{dup 3:{dup 14:{c -}}{c TT}}{dup 14:{c -}}{c TRC}}"
n di "{txt:{c |}{center 14:Chunksize (mb)}{c |}{center 14:Number of}{c |}{center 14:~Number}{c |}{center 14:Stata size*}{c |}}"
n di "{txt:{c |}{center 14:option}{c |}{center 14:Chunks}{c |}{center 14:obs/chunk}{c |}{center 14:(megabytes)}{c |}}"
n di "{txt:{c LT}{dup 3:{dup 14:{c -}}{c +}}{dup 14:{c -}}{c RT}}"
// One row per candidate chunk size (in units of 1,000,000 bytes); the table
// stops after the first size for which the whole file fits in a single chunk.
foreach s of numlist 10 30 100 300 1000 3000 {
	local num_chunks 	= ceil(r(filesize)/(`s'*1000000))
	local num_lines		= round(r(lnum)/`num_chunks',1)
	local stata_chunk	= round((`s_stata_size'/`num_chunks')/1000000,.1)
	n di "{txt:{c |}{ralign 12:`s'}  {c |}{ralign 12:{res:`num_chunks'}}  {c |}{ralign 12:{res:`num_lines'}}  {c |}{ralign 12:{res:`stata_chunk'}}  {c |}}"
	if `num_chunks' == 1 {
		continue, break
		}
	}
n di "{txt:{c BLC}{dup 3:{dup 14:{c -}}{c BT}}{dup 14:{c -}}{c BRC}}"
n di "{txt:* Stata file size is very approximate and depends on datatypes of variables}"
// clickable link that re-runs hexdump with its output shown
n di _n `"{txt:Further detail available by running {stata `"hexdump `"`infile'"', analyze results"':hexdump `"`infile'"', analyze results}}"'

end

*****************
* Mata routines *
*****************

* Everything from here to the matching -end- is Mata code.
version 9
mata:
// NOTE(review): -mata clear- is not limited to the functions defined in this
// file; it appears to discard every Mata object currently in memory, including
// anything the user created.  Confirm this is intended before relying on it.
mata clear

// chunkfile(): split the text file infile into consecutive chunk files.
//
//   infile     path of the text file to split
//   chunksize  target chunk size in bytes (each chunk is extended to the end
//              of the line it stops in, so chunks hold whole lines)
//   stub       file name stub; chunks are written as stub + 4-digit number + ".txt"
//   header     1 = no header line, 2 = copy the first line into every chunk,
//              3 = read the first line and drop it
//   replace    1 = delete an existing chunk file before writing it
//
// Returns a string holding the chunk file names, each wrapped in compound
// double quotes and followed by a blank.
// Aborts with return code 198 for an unusable chunksize and 692 when a header
// was requested but the first line cannot be read.
string function chunkfile(
	string scalar infile,
	scalar chunksize,
	string scalar stub,
	scalar header,
	scalar replace
	)
	{
	// every local is declared so the function also compiles under matastrict
	real scalar mem, bites, bitesize, stop, fh_in, fh_out, n_chunk, n_bite, wrote
	string scalar chunk_name, chunk_list, bite
	// fread()/fgetnl() hand back J(0,0,"") at end of file, so the variables
	// receiving their results are declared as matrices rather than scalars
	string matrix headertext, bigbite, littlebite

	// a zero, negative or missing size would give zero/missing bites and the
	// chunk loop below could then never consume the input
	if (chunksize >= . | chunksize < 1) {
		errprintf("chunksize must be at least 1 byte\n")
		exit(198)
		}

// determine bitesize based on requested chunksize and available memory
	mem = c("memory")
//	mem = 25000
	if (mem >= . | mem < 1) mem = chunksize	// unusable memory figure: read each chunk in one bite
	bites = ceil(chunksize/mem)
	bitesize = trunc(chunksize/bites)
	printf("{txt}\nChunking using the following settings:\n\nChunksize:{col 13}{res}%15.0gc\n{txt}Memory:{col 13}{res}%15.0gc\n{txt}Bites:{col 13}{res}%15.0gc\n{txt}Bitesize:{col 13}{res}%15.0gc\n\n",chunksize, mem , bites, bitesize)
	displayflush()

	fh_in = fopen(infile, "r")
	if (header != 1) {	/*1 means no header - do nothing*/
		if ( (headertext=fgetnl(fh_in))!=J(0,0,"") ) {
			printf("{txt}%s header: {res}",( (header==2) ? "Include" : "Skip" ))  /*2 and 3 means there is a header*/
			headertext	/* bare expression: displays the header line */
			printf("{txt}\n(for reference: EOL characters {it:0d0a} (CRLF) indicate Windows, {it:0a} (LF) Unix and {it:0d} (CR) Mac. {it:09} is the TAB character.)\n\n")
			}
		else { /*read problem*/
			fclose(fh_in)
			errprintf("Cannot read header\n")
			exit(692)
			}
		}

	stop = 0 					// flag for end of file
	n_chunk = 0 			// chunk numbering
	chunk_list = ""		// to accumulate list of filenames

// loop through chunks and bites
	while (stop == 0) {
		chunk_name = stub+substr("000"+strofreal(++n_chunk),-4,4)+".txt"
		if ( (fileexists(chunk_name)==1) & (replace==1) ) unlink(chunk_name)
		fh_out = fopen(chunk_name, "w")
		if (header == 2) fwrite(fh_out, headertext)
		wrote = 0	// becomes 1 once data from infile has gone into this chunk
		for (n_bite=1; n_bite <= bites; n_bite++) {
			if ( (bigbite = fread(fh_in, bitesize))!=J(0,0,"") ) {
				// top the bite up to the end of the current line
				if ( (littlebite=fgetnl(fh_in))!=J(0,0,"") ) {
					bite = bigbite + littlebite
					}
				else {
					bite = bigbite
					stop = 1
					}
				fwrite(fh_out, bite)
				wrote = 1
				if (stop==1) break
				}
			else {
				// nothing left to read: flag end of file explicitly instead
				// of relying on the file status after later calls
				stop = 1
				break
				}
			}
		fclose(fh_out)
		if (wrote == 0 & n_chunk > 1) {
			// the input ended exactly at the previous chunk boundary; do not
			// leave behind (or report) a chunk holding no data
			unlink(chunk_name)
			break
			}
		chunk_list = chunk_list + "`" + `"""' + chunk_name + `"""' + "' "
		printf("{txt}Chunk{res} %s {txt}saved. Now at position{res} %-16.0gc {err}%s\n", chunk_name ,  ftell(fh_in), ((stop==1) ? "End of File" : ""))
		if (fstatus(fh_in)!=0) break
	}
	fclose(fh_in)
	printf("\n")
	return(chunk_list) // send list of filenames back to caller
}

end

mata:
// peek(): display the first lines of a text file.
//
//   infile  path of the file to inspect
//   peek    number of lines to display
//
// Lines are shown through a bare string expression, the same way chunkfile()
// shows the header line.  Display stops early when the file has fewer lines
// than requested, and the file is closed before returning.
void function peek(
	string scalar infile,
	scalar peek
	)
{
	// declared so the function also compiles under matastrict
	real scalar fh_in, n_lines
	// fgetnl() hands back J(0,0,"") at end of file, hence matrix not scalar
	string matrix line

	printf("{txt}\nPeeking at the first {res}%f {txt}lines of {res}%s\n\n", peek, infile)
	fh_in = fopen(infile, "r")
	for (n_lines=1; n_lines <= peek; n_lines++) {
		if ( (line=fgetnl(fh_in))==J(0,0,"") ) break	// end of file reached
		sprintf("%s",line)
	}
	fclose(fh_in)
	// hex codes written with zeros (0d0a, 0a, 0d), matching chunkfile()'s note
	printf("{txt}\n(for reference: End of line characters {res}{it:0d0a} {txt}(CRLF) indicate Windows, {res}{it:0a} {txt}(LF) Unix and {res}{it:0d} {txt}(CR) Mac. {c 10}{res}{it:09} {txt}is the TAB character.)\n\n" )
}
end

exit