; docformat = 'rst'
;+
; :Author: Paulo Penteado (http://www.ppenteado.net), Mar/2015
;-
;+
; :Description:
; Parses table data in a text file (or text array) as an array, with multiple options to specify different
; file formats and processing to be applied to the file. The output can be a string array or an
; array of structures.
;
; :Params:
; file: in, required
; A string with the file name to read. If `buffer` is set, this should be a string
; array, where each element corresponds to what would be a file line.
;
; :Keywords:
; header: out, optional
; The header line(s) from the text file, unparsed. The number of header lines
; is set by `nheader`.
; lines: out, optional
; A string array, with one element per line in the input file.
; splitlines: out, optional
; A list where each element is a string array corresponding to one line of the
; input file. Each element in the array is one column from that file line.
; as_struct: in, optional, default=0
; If set, the output is an array of structures, one structure per input line.
; fieldnames: in, out, optional
; The names for the structure fields returned when `as_struct` is set.
; If this is not given, field names are taken from the last line of the file header.
; types: in, out, optional
; A hash containing type specifications for each of the structure fields to be created
; when `as_struct` is set. If not given, it will be determined by guessing from the
; file's column contents.
; trim: in, optional, default=2
; Determines the type of leading/trailing trimming to be applied to the file lines. It is
; passed to strtrim, which is applied to all file lines.
; spacedelimited: in, optional, default=0
; If set, the columns are assumed to be separated by any positive number of blank
; spaces. If not set, the columns are assumed to be fixed length, equal to the
; lengths used in the header line.
; skipblank: in, optional, default=0
; If set, blank lines in the file are skipped.
; delimiter: in, optional
; The character(s) used as column delimiter in the file (the columns are split
; with strsplit). If not given, the input columns are assumed to be separated by blank space.
; stripquotes: in, optional, default=0
; If set, table elements enclosed in quotes will have the quotes removed.
; isinteger: out, optional
; If provided, will return a list, with one element per column of the file. Each element
; is an array that informs whether the corresponding column element in the input is an integer.
; Most often used for debugging and finding anomalous values in the input.
; isfloat: out, optional
; If provided, will return a list, with one element per column of the file. Each element
; is an array that informs whether the corresponding column element in the input is a float.
; Most often used for debugging and finding anomalous values in the input.
; missingint: in, optional
; If provided, any missing values in columns with integers will be filled with this value.
; missingfloat: in,optional
; If provided, any missing values in columns with floats will be filled with this value.
; blank: in, optional
; Passed to `pp_isnumber`. If set, blank strings are considered valid numbers.
; buffer: in,optional, default=0
; If set, the first argument (`file`) is taken as a string array of the file contents,
; instead of a file name to be read.
; nheader: in, optional, default=1
; The number of header lines contained in the file. If `as_struct` is set and
; field names are not provided, the last line on the header is used to determine
; column names.
;
;
; :Examples:
;
; Read some example files provided with IDL, as structures::
;
; file=filepath('ascii.txt',subdirectory=['examples','data'])
; a=pp_parsetext(file,/skipblank,nheader=4,header=header,delimiter=',',$
; /as_struct,fieldnames=['lon','lat','el','temp','dew','speed','dir'])
; help,a
; ;A STRUCT = -> <Anonymous> Array[15]
; ;help,a[0]
; ;** Structure <4023e918>, 7 tags, length=56, data length=56, refs=2:
; ;LON DOUBLE -156.95000
; ;LAT DOUBLE 20.783300
; ;EL LONG64 399
; ;TEMP LONG64 68
; ;DEW LONG64 64
; ;SPEED LONG64 10
; ;DIR LONG64 60
; print,header
; ;This file contains ASCII format weather data in a comma delimited table with comments prefaced by the "%" character. The columns represent:
; ;Longitude, latitude, elevation (in feet), temperature (in degrees F), dew point (in degrees F), wind speed (knots), wind direction (degrees)
;
; :Requires: `pp_isnumber`, `pp_readtxt`
;
; :Todo:
; Expand documentation, with more examples. This function has received many options
; to be capable of parsing different kinds of text files I encounter, which means
; its options make for a large variety of possibilities in file formats.
;
;
; :Author: Paulo Penteado (http://www.ppenteado.net), Mar/2015
;-
function pp_parsetext,file,header=header,lines=lines,splitlines=liness,as_struct=as_struct,$
  fieldnames=fieldnames,types=types,trim=trim,spacedelimited=spacedelimited,skipblank=skipblank,$
  delimiter=delimiter,stripquotes=stripquotes,isinteger=isinteger,isfloat=isfloat,$
  missingint=missingint,missingfloat=missingfloat,blank=blank,buffer=buffer,nheader=nheader
compile_opt idl2,logical_predicate

;Parses a text table (a file, or a string array when /buffer is set) into a
;[ncolumns,nlines] string array or, with /as_struct, into an array of structures
;with one field per column. Column types not supplied through `types` are guessed
;with pp_isnumber: LONG64 if every element is an integer, DOUBLE if every element
;is a number, string otherwise.

;Keyword defaults
trim=n_elements(trim) ? trim : 2
spacedelimited=keyword_set(spacedelimited)
stripquotes=keyword_set(stripquotes)
replaceints=n_elements(missingint) gt 0
replacefloats=n_elements(missingfloat) gt 0
blank=keyword_set(blank)
buffer=keyword_set(buffer)
delimiter=n_elements(delimiter) ? delimiter : !null
hasdelimiter=n_elements(delimiter) gt 0
;Remember whether the diagnostic lists were requested, so that the loop below
;does not have to apply comparison operators to the list objects themselves
wantisinteger=arg_present(isinteger)
wantisfloat=arg_present(isfloat)

;Get the lines, from the file or from the provided string array
if buffer then lines=file else lines=pp_readtxt(file)
if keyword_set(skipblank) then begin
  lines=lines[where(strtrim(lines,2) ne '',/null)]
endif

nheader=n_elements(nheader) ? nheader : 1L
if n_elements(lines) le nheader then $
  message,'Input contains no data lines after the '+strtrim(nheader,2)+' header line(s).'

;The line defining the columns is the last header line or, when there is no
;header, the first data line
if nheader gt 0 then begin
  header=lines[0:nheader-1]
  lines=lines[nheader:-1]
  colsource=header[-1]
endif else colsource=lines[0]

;Start position and length of each column (used for fixed-width parsing)
s=hasdelimiter ? strsplit(colsource,delimiter) : strsplit(colsource)
ncol=n_elements(s)
maxlen=max(strlen(lines))
;s[1:-1] is an illegal range for a single column, so that case is handled apart
e=(ncol gt 1) ? [s[1:-1],maxlen] : [maxlen]
l=e-s

;Column names: from the header, or generated (field_0, field_1,...) when there
;is no header. In the latter case the generated names are returned in `header`.
if nheader gt 0 then begin
  fn=hasdelimiter ? strsplit(colsource,delimiter,/extract) : strsplit(colsource,/extract)
endif else begin
  fn='field_'+strtrim(sindgen(ncol),2)
  header=fn
endelse

;Split every line into its columns; the result is always [ncol,nlines]
if spacedelimited then begin
  liness=transpose((strsplit(lines,/extract)).toarray())
endif else if hasdelimiter then begin
  liness=transpose((strsplit(lines,delimiter,/extract)).toarray())
endif else begin
  ;strmid drops the column dimension when there is a single column, so it is
  ;restored explicitly
  liness=reform(strmid(lines,s,l),ncol,n_elements(lines))
endelse

;Remove enclosing double quotes from table elements and column names
if stripquotes then begin
  w=where(stregex(liness,'"(.*)"',/boolean),count)
  if count then liness[w]=(stregex(liness[w],'"(.*)"',/subexpr,/extract))[-1,*]
  w=where(stregex(fn,'"(.*)"',/boolean),count)
  if count then fn[w]=(stregex(fn[w],'"(.*)"',/subexpr,/extract))[-1,*]
endif

fieldnames=n_elements(fieldnames) ? idl_validname(fieldnames,/convert_all) : idl_validname(fn,/convert_all)
if trim then liness=strtrim(liness,trim)
isinteger=wantisinteger ? list() : !null
isfloat=wantisfloat ? list() : !null

if keyword_set(as_struct) then begin
  ret={}
  typeh=n_elements(types) ? types[*] : hash()
  foreach field,fieldnames,ifield do begin
    if ~typeh.haskey(field) then begin
      ;Locate the missing (blank) elements before deciding the column type.
      ;/null makes where() return !null, instead of -1, when there are none:
      ;-1 used as a subscript would address the last line of the table.
      wblank=where(reform(liness[ifield,*]) eq '',nblank,/null)
      ;Blanks do not disqualify a column from being numeric if a fill value
      ;was provided for them
      tmpi=reform(pp_isnumber(liness[ifield,*],/integer,blank=(blank or replaceints)))
      if wantisinteger then isinteger.add,tmpi
      isint=array_equal(tmpi,1)
      tmpf=reform(pp_isnumber(liness[ifield,*],/nan,/infinity,blank=(blank or replaceints or replacefloats)))
      if wantisfloat then isfloat.add,tmpf
      isdouble=array_equal(tmpf,1)
      ;Fill values are only written into columns of the matching numeric type;
      ;string columns keep their blanks
      case 1 of
        isint: begin
          typeh[field]=0LL
          if replaceints && (nblank gt 0) then liness[ifield,wblank]=missingint
        end
        isdouble: begin
          typeh[field]=0d0
          if nblank gt 0 then begin
            if replacefloats then begin
              liness[ifield,wblank]=missingfloat
            endif else if replaceints then begin
              ;Only an integer fill value was given: use it, as before
              liness[ifield,wblank]=missingint
            endif
          endif
        end
        else: typeh[field]=''
      endcase
    endif
    ret=create_struct(ret,field,typeh[field])
  endforeach
  ;One structure per data line; the strings are converted to each field's type
  ;on assignment
  ret=replicate(ret,n_elements(lines))
  foreach field,fieldnames,ifield do begin
    ret.(ifield)=reform(liness[ifield,*])
  endforeach
endif else ret=liness
return,ret
end