; pp_parsetext.pro

; docformat = 'rst'
;+
; :Author: Paulo Penteado (http://www.ppenteado.net), Mar/2015
;-
;+
; :Description:
;    Parses table data in a text file (or text array) as an array, with multiple options to specify different
;    file formats and processing to be applied to the file. The output can be a string array or an
;    array of structures.
;
; :Params:
;   file: in, required
;    A string with the file name to read. If `buffer` is set, this should be a string
;    array, where each element corresponds to what would be a file line.
;
; :Keywords:
;    header: out, optional
;      The header line(s) from the text file, unparsed. The number of header lines
;      is set by `nheader`.
;    lines: out, optional
;      A string array, with one element per line in the input file.
;    splitlines: out, optional
;      A list where each element is a string array corresponding to one line of the
;      input file. Each element in the array is one column from that file line.
;    as_struct: in, optional, default=0
;      If set, the output is an array of structures, one structure per input line.
;    fieldnames: in, out, optional
;      The names for the structure fields returned when `as_struct` is set.
;      If this is not given, field names are taken from the last line of the file header
;    types: in, out, optional
;      A hash containing type specifications for each of the structure fields to be created
;      when `as_struct` is set. If not given, it will be determined by guessing from the
;      file's column contents.
;    trim: in, optional, default=2
;      Determines the type of leading/trailing trimming to be applied to the parsed column values. It is
;      passed as the flag to strtrim, which is applied to every column value. If set to 0, no trimming is done.
;    spacedelimited: in, optional, default=0
;      If set, the columns are assumed to be separated by any positive number of blank
;      spaces. If not set, the columns are assumed to be fixed length, equal to the
;      lengths used in the header line.
;    skipblank: in, optional, default=0
;      If set, blank lines in the file are skipped.
;    delimiter: in, optional
;      The character(s) used as column delimiter in the file (the columns are split
;      with strsplit). If not given, the input columns are assumed to be separated by blank space.
;    stripquotes: in, optional, default=0
;      If set, table elements enclosed in quotes will have the quotes removed.
;    isinteger: out, optional
;      If provided, will return a list, with one element per column of the file. Each element
;      is an array that informs whether the corresponding column element in the input is an integer.
;      Most often used for debugging and finding anomalous values in the input. 
;    isfloat: out, optional
;      If provided, will return a list, with one element per column of the file. Each element
;      is an array that informs whether the corresponding column element in the input is a float.
;      Most often used for debugging and finding anomalous values in the input.
;    missingint: in, optional
;      If provided, any missing values in columns with integers will be filled with this value.
;    missingfloat: in,optional
;      If provided, any missing values in columns with floats will be filled with this value.
;    blank: in, optional
;      Passed to `pp_isnumber`. If set, blank strings are considered valid numbers.
;    buffer: in,optional, default=0
;      If set, the first argument (`file`) is taken as a string array of the file contents,
;      instead of a file name to be read.
;    nheader: in, optional, default=1
;      The number of header lines contained in the file. If `as_struct` is set and
;      field names are not provided, the last line on the header is used to determine
;      column names.
;      
;
; :Examples:
;
;    Read some example files provided with IDL, as structures::
;
;      file=filepath('ascii.txt',subdirectory=['examples','data'])
;      a=pp_parsetext(file,/skipblank,nheader=4,header=header,delimiter=',',$
;      /as_struct,fieldnames=['lon','lat','el','temp','dew','speed','dir'])
;      help,a
;      ;A               STRUCT    = -> <Anonymous> Array[15]
;      ;help,a[0]
;      ;** Structure <4023e918>, 7 tags, length=56, data length=56, refs=2:
;      ;LON             DOUBLE          -156.95000
;      ;LAT             DOUBLE           20.783300
;      ;EL              LONG64                       399
;      ;TEMP            LONG64                        68
;      ;DEW             LONG64                        64
;      ;SPEED           LONG64                        10
;      ;DIR             LONG64                        60
;      print,header
;      ;This file contains ASCII format weather data in a comma delimited table with comments prefaced by the "%" character. The columns represent:
;      ;Longitude, latitude, elevation (in feet), temperature (in degrees F),  dew point (in degrees  F), wind speed (knots), wind direction (degrees)
;
; :Requires: `pp_isnumber`, `pp_readtxt`
;
; :Todo:
;   Expand documentation, with more examples. This function has received many options
;   to be capable of parsing different kinds of text files I encounter, which means
;   its options make for a large variety of possibilities in file formats.
;
;
; :Author: Paulo Penteado (http://www.ppenteado.net), Mar/2015
;-
function pp_parsetext,file,header=header,lines=lines,splitlines=liness,as_struct=as_struct,$
fieldnames=fieldnames,types=types,trim=trim,spacedelimited=spacedelimited,skipblank=skipblank,$
delimiter=delimiter,stripquotes=stripquotes,isinteger=isinteger,isfloat=isfloat,$
missingint=missingint,missingfloat=missingfloat,blank=blank,buffer=buffer,nheader=nheader
compile_opt idl2,logical_predicate

;Keyword defaults
trim=n_elements(trim) ? trim : 2
spacedelimited=keyword_set(spacedelimited)
stripquotes=keyword_set(stripquotes)
replaceints=n_elements(missingint)
replacefloats=n_elements(missingfloat)
blank=keyword_set(blank)
buffer=keyword_set(buffer)
delimiter=n_elements(delimiter) ? delimiter : !null
hasdelimiter=n_elements(delimiter) ne 0

;Get the text lines, either directly from the argument or from the file
if buffer then lines=file else lines=pp_readtxt(file)
if keyword_set(skipblank) then begin
  lines=lines[where(strtrim(lines,2) ne '',/null)]
endif

;Separate the header from the data and determine the raw column names.
;refline is the line used as the reference for column names/positions:
;the last header line, or the first data line when there is no header.
nheader=n_elements(nheader) ? nheader : 1L
if nheader then begin
  header=lines[0:nheader-1]
  refline=header[-1]
  fn=hasdelimiter ? strsplit(refline,delimiter,/extract) : strsplit(refline,/extract)
endif else begin
  ;No header: count the columns in the first data line (honoring the delimiter)
  ;and make up the names field_0, field_1,... (flag 2 in strtrim is needed to
  ;remove the leading blanks produced by sindgen)
  refline=lines[0]
  fn=hasdelimiter ? strsplit(refline,delimiter,/extract) : strsplit(refline,/extract)
  fn='field_'+strtrim(sindgen(n_elements(fn)),2)
  header=fn
endelse
lines=lines[nheader:-1]

;Split every data line into its columns; the result is indexed as liness[column,line]
if spacedelimited then begin
  liness=transpose((strsplit(lines,/extract)).toarray())
endif else if hasdelimiter then begin
  liness=transpose((strsplit(lines,delimiter,/extract)).toarray())
endif else begin
  ;Fixed width columns: each column starts where the corresponding token of the
  ;reference line starts, and ends where the next one starts (the last column
  ;goes up to the end of the longest line)
  s=strsplit(refline)
  maxlen=max(strlen(lines))
  e=(n_elements(s) gt 1) ? [s[1:-1],maxlen] : [maxlen]
  l=e-s
  ;reform makes sure the result is [column,line] even when there is only one column
  liness=reform(strmid(lines,s,l),n_elements(s),n_elements(lines))
endelse

;Remove enclosing double quotes from the values and from the column names
if stripquotes then begin
  w=where(stregex(liness,'"(.*)"',/boolean),count)
  if count then liness[w]=(stregex(liness[w],'"(.*)"',/subexpr,/extract))[-1,*]
  w=where(stregex(fn,'"(.*)"',/boolean),count)
  if count then fn[w]=(stregex(fn[w],'"(.*)"',/subexpr,/extract))[-1,*]
endif

;User-provided field names take precedence over those found in the file
fieldnames=idl_validname(n_elements(fieldnames) ? fieldnames : fn,/convert_all)
if trim then liness=strtrim(liness,trim)

;The per-column diagnostics are only collected when the caller asked for them
wantisinteger=arg_present(isinteger)
wantisfloat=arg_present(isfloat)
isinteger=wantisinteger ? list() : !null
isfloat=wantisfloat ? list() : !null

if keyword_set(as_struct) then begin
  ret={}
  typeh=n_elements(types) ? types[*] : hash() ;work on a copy of the input hash
  foreach field,fieldnames,ifield do begin
    ;Columns without a type specification have their type guessed from the contents
    if ~typeh.haskey(field) then begin
      tmpi=reform(pp_isnumber(liness[ifield,*],/integer,blank=(blank or replaceints)))
      if replaceints then begin
        wi=where(liness[ifield,*] eq '',counti)
        ;Only fill when there is something to fill: where() returns -1 if there
        ;are no blanks, and using that as an index would overwrite a valid row
        if counti then liness[ifield,wi]=missingint
      endif
      if wantisinteger then isinteger.add,tmpi
      isint=array_equal(tmpi,1) ;true only if every element is an integer
      tmpf=reform(pp_isnumber(liness[ifield,*],/nan,/infinity,blank=(blank or replacefloats)))
      if replacefloats then begin
        wf=where(liness[ifield,*] eq '',countf)
        if countf then liness[ifield,wf]=missingfloat
      endif
      if wantisfloat then isfloat.add,tmpf
      isdouble=array_equal(tmpf,1) ;true only if every element is a valid float
      ;Integer takes precedence over double, and anything else is kept as string
      case 1 of
        isint: typeh[field]=0LL
        isdouble: typeh[field]=0d0
        else: typeh[field]=''
      endcase
    endif
    ret=create_struct(ret,field,typeh[field])
  endforeach
  ;One structure per data line; the assignment converts the strings to each field's type
  ret=replicate(ret,n_elements(lines))
  foreach field,fieldnames,ifield do begin
    ret.(ifield)=reform(liness[ifield,*])
  endforeach
endif else ret=liness
return,ret
end