; docformat = 'idl'
; $Id: //depot/Release/ENVI50_IDL82/idl/idldir/lib/read_csv.pro#2 $
;
; Copyright (c) 2008-2012, Exelis Visual Information Solutions, Inc. All
; rights reserved. Unauthorized reproduction is prohibited.
;----------------------------------------------------------------------------
function read_csv_fieldnames, fieldCount
  compile_opt idl2, hidden

  ; Build default column names 'field1' ... 'fieldN'. The index is zero-padded
  ; to the number of decimal digits in fieldCount (e.g. 'field01' .. 'field12'),
  ; so every generated name has the same length.
  nDigits = STRLEN(STRTRIM(STRING(fieldCount), 2))
  width = STRTRIM(STRING(nDigits), 2)

  ; Format of the form '(iW.W)': minimum width W, padded with zeros to W digits.
  indexFormat = '(i' + width + '.' + width + ')'

  ; One-based column indices.
  indices = LINDGEN(fieldCount) + 1

  return, 'field' + STRING(indices, FORMAT=indexFormat)
end
;----------------------------------------------------------------------------
;+
; :Description:
; The READ_CSV function reads data from a "comma-separated value"
; (comma-delimited) text file into an IDL structure variable.
;
; This routine handles CSV files consisting of an optional line of column
; headers, followed by columnar data, with commas separating each field.
; Each row is assumed to be a new record.
;
; The READ_CSV routine will automatically return each column (or field)
; in the correct IDL variable type using the following rules:
;
; * Long - All data within that column consists of integers,
; all of which are smaller than the maximum 32-bit integer.
; * Long64 - All data within that column consists of integers,
; with at least one greater than the maximum 32-bit integer.
; * Double - All data within that column consists of numbers, at least
; one of which has either a decimal point or an exponent.
; * String - All data which does not fit into one of the above types.
;
; This routine is written in the IDL language. Its source code can be
; found in the file read_csv.pro in the lib subdirectory of the IDL
; distribution.
;
; :Syntax:
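; Result = READ_CSV( Filename
; [, COUNT=variable] [, HEADER=variable] [, MISSING_VALUE=value]
; [, NUM_RECORDS=value] [, RECORD_START=value]
; [, N_TABLE_HEADER=value] [, TABLE_HEADER=variable]
; [, TYPES=value] [, ROWS_FOR_TESTING=value]
; [, NAN=value] [, INFINITY=value] [, INTEGER=value] [, TRIM=value] [, BLANK=value]
; )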
;
; :Params:
; Filename
; A string containing the name of a CSV file to read into an IDL variable.
;
; :Keywords:
; COUNT
; Set this keyword equal to a named variable that will contain the
; number of records read.
;
; HEADER
; Set this keyword equal to a named variable that will contain the
; column headers as a vector of strings. If no header exists,
; an empty scalar string is returned.
;
; MISSING_VALUE
; Set this keyword equal to a value used to replace any missing
; floating-point or integer data. The default value is 0.
;
; NUM_RECORDS
; Set this keyword equal to the number of records to read.
; The default is to read all records in the file.
;
; RECORD_START
; Set this keyword equal to the index of the first record to read.
; The default is the first record of the file (record 0).
;
; N_TABLE_HEADER
; Set this keyword to the number of lines to skip at the beginning of the file,
; not including the HEADER line. These extra lines may be retrieved by using the TABLE_HEADER keyword.
;
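; TABLE_HEADER
; Set this keyword to a named variable in which to return an array of strings
; containing the extra table headers at the beginning of the file, as specified by N_TABLE_HEADER.
;
; TYPES
; Set this keyword to the IDL type code(s) to use for the columns instead of
; determining them from the data. A single value is applied to every non-empty
; column. If the number of elements does not match the number of non-empty
; columns, the types are determined automatically and stored in this variable.
; A type code of 0 leaves the column as strings. Only used when more than one
; row is read.
;
; NAN, INFINITY, INTEGER, TRIM, BLANK
; Passed unchanged to pp_isnumber when testing whether a column is numeric.
;
; ROWS_FOR_TESTING
; Maximum number of rows to examine when determining the type of each column.
; The default is 100. Set to 0 to examine all rows.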
;
; :History:
; Written, CT, VIS, Oct 2008
; MP, VIS, Oct 2009: Added keyword NSKIP and SKIP_HEADER
;
;-
function read_csv_pp_strings, Filename, $
COUNT=count, $
HEADER=header, $
MISSING_VALUE=missingValue, $
NUM_RECORDS=numRecordsIn, $
RECORD_START=recordStart, $
N_TABLE_HEADER=nTableHeader, $
TABLE_HEADER=tableHeader, $
_EXTRA=_extra,$ ; needed for iOpen
;New parameters, by Paulo Penteado (http://www.ppenteado.net):
types=types,$ ;if provided, use these type codes for the columns instead of trying to determine them
nan=nan,infinity=infinity,integer=integer,trim=trim,blank=blank,$ ;passed to pp_isnumber when testing columns
rows_for_testing=rows_for_testing ;Maximum number of rows to use when testing columns for data types.
;Set to zero to test all rows
;
; Overview of the implementation:
;   1. Read the table-header lines and the first non-blank line directly,
;      to fill TABLE_HEADER and to count the number of fields.
;   2. Read the whole file with READ_ASCII, using a template in which every
;      field is a string (type code 7).
;   3. Drop empty columns and leading/trailing empty rows.
;   4. If more than one row remains: determine (or accept via TYPES) a type
;      code per column, detect a header line, and convert the columns.
;      A type code of 0 means "leave the column as strings".
;   5. Package the columns into an anonymous structure.
; Returns 0 for a zero-length file or when every field is empty.
; TYPES is modified in place: a single value is replicated to all columns,
; and a value whose element count does not match the number of non-empty
; columns is replaced by the automatically determined types.
compile_opt idl2, hidden
;ON_ERROR, 2 ;Return on error
; On any error: cancel the handler, release the file unit and any column
; pointers that exist, then re-issue the error message.
CATCH, err
if (err ne 0) then begin
CATCH, /CANCEL
if (N_ELEMENTS(lun) gt 0) then $
FREE_LUN, lun
if (MAX(PTR_VALID(pData)) eq 1) then $
PTR_FREE, pData
MESSAGE, !ERROR_STATE.msg
endif
; Default HEADER result when no header line is found.
header = ''
if (N_PARAMS() eq 0) then $
MESSAGE, 'Incorrect number of arguments.'
; Empty file
if (FILE_TEST(filename, /ZERO_LENGTH)) then $
return, 0
;Set appropriate dataStart, where dataStart includes column header.
; dataStart is the number of table-header lines to skip (0 if N_TABLE_HEADER
; is unset or zero).
dataStart = keyword_set(nTableHeader) ? LONG64(nTableHeader) : 0
OPENR, lun, filename, /GET_LUN
str = ''
tableHeader=''
; Read dataStart+1 lines. The first dataStart lines are table headers and are
; accumulated in tableHeader; the last line read (i eq dataStart) is left
; untouched in str for the field counting below.
for i=0L, dataStart do begin
READF, lun, str
if i ne dataStart then begin
; pos is the position of the first double quote (-1 if there is none),
; so pos eq 0 means the line starts with a quote.
pos = stregex(str, '"')
if pos ne 0 then begin ; string not enclosed in quotes
; Keep only the text before the first comma.
pos = stregex(str, ',+'); check for extra commas
if pos ne -1 then str = strmid(str, 0, pos)
endif else begin
; string enclosed in quotes
; If a closing quote followed by commas is found, keep the text between the
; opening quote and that closing quote; otherwise just remove all quotes.
pos = stregex(str, '",+') ; check for extra commas
if pos ne -1 then str = strmid(str, 1, pos-1) else str = strjoin(strsplit(str, '"', /EXTRACT))
endelse
if i eq 0 then tableHeader = str else tableHeader = [tableHeader, str]
endif
endfor
; Skip blank lines until a non-blank line is in str.
while (STRLEN(STRTRIM(str,2)) eq 0) do begin
READF, lun, str
endwhile
FREE_LUN, lun
; We need to count the number of fields.
; First remove escaped quote characters, which look like "".
str = STRJOIN(STRTOK(str, '""', /REGEX, /EXTRACT))
; Now remove quoted strings, which might contain bogus commas.
str = STRJOIN(STRTOK(str,'"[^"]*"', /REGEX, /EXTRACT))
; Finally, count the comma-separated fields (empty fields included).
fieldCount = N_Elements(STRTOK(str, ',', /PRESERVE_NULL))
fieldNames = Read_CSV_Fieldnames(fieldCount)
; Template for READ_ASCII: every field is read as a string (type code 7);
; type conversion is done later in this routine.
template = { $
version: 1.0, $
dataStart: dataStart, $ ; specified as a keyword below
delimiter: BYTE(','), $ ; comma-separated
missingValue: 0, $
commentSymbol: '', $
fieldCount: fieldCount, $
fieldTypes: REPLICATE(7L, fieldCount), $
fieldNames: fieldNames, $
fieldLocations: LONARR(fieldCount), $ ; ignored for csv
fieldGroups: LINDGEN(fieldCount) $ ; ungrouped
}
; Request one extra record when NUM_RECORDS is given, in case the first
; record turns out to be a header line (see the header handling below).
if (N_Elements(numRecordsIn)) then $
numRecords = numRecordsIn[0] + 1
data = READ_ASCII(filename, /CSV, $
COUNT=count, $
DATA_START=dataStart, $
NUM_RECORDS=numRecords, $
RECORD_START=recordStart, $
TEMPLATE=template)
if (N_TAGS(data) eq 0) then $
MESSAGE, 'File "' + filename + '" is not a valid CSV file.', /NONAME
; Eliminate empty columns
; For each field: trim whitespace, record the maximum string length, and
; track the first and last rows that contain any non-empty value.
columnLen = LONARR(fieldCount)
firstNonEmptyRow = count - 1
lastNonEmptyRow = 0L
for i=0L,fieldCount-1 do begin
data.(i) = STRTRIM(data.(i), 2)
lengths = STRLEN(data.(i))
good = WHERE(lengths gt 0, ngood)
if (ngood gt 0) then begin
firstNonEmptyRow = firstNonEmptyRow < MIN(good)
lastNonEmptyRow = lastNonEmptyRow > MAX(good)
columnLen[i] = MAX(lengths)
endif
endfor
; Number of columns that contain at least one non-empty value.
nColumns = LONG(TOTAL(columnLen gt 0))
; All of the fields were empty.
if (nColumns eq 0) then begin
return, 0
endif
; From here on, count is the number of rows between the first and the last
; non-empty row, inclusive.
count = lastNonEmptyRow - firstNonEmptyRow + 1
; Convert each field to a pointer, for easier manipulation.
; Empty columns are skipped, so pData and columnLen are compacted to nColumns
; entries (index j), keeping only the non-empty row range.
j = 0L
pData = PTRARR(nColumns)
for i=0L,fieldCount-1 do begin
if (columnLen[i] eq 0) then continue
columnLen[j] = columnLen[i]
pData[j] = PTR_NEW((data.(i))[firstNonEmptyRow:lastNonEmptyRow])
j++
endfor
; The structure returned by READ_ASCII is no longer needed.
data = 0
columnLen = columnLen[0:nColumns-1]
; With a single row, no type detection, header detection or conversion is
; done: all columns stay as strings.
if (count gt 1) then begin
; Attempt to determine the data types for each field.
; A single TYPES value applies to every column. TYPES is used as given only
; if it has exactly nColumns elements; otherwise it is overwritten below.
if (n_elements(types) eq 1) then types=replicate(types,nColumns)
if (n_elements(types) ne nColumns) then begin
; types[j] = 0 means "keep column j as strings".
types = BYTARR(nColumns)
; Number of rows to test per column: ROWS_FOR_TESTING if given (0 = all rows),
; otherwise 100.
rowlimit=n_elements(rows_for_testing) eq 1 ? rows_for_testing : 100
if (rowlimit eq 0) then rowlimit=count-1
for j=0,nColumns-1 do begin
; Row 0 is excluded from the test because it may be a header line.
subdata = (*pData[j])[1:(rowlimit < (count-1))]
; An I/O (conversion) error in the statements below jumps to skip1,
; leaving types[j] at 0.
ON_IOERROR, skip1
tmpDouble = DOUBLE(subdata)
tmpLong64 = LONG64(subdata)
tmpLong = LONG(subdata)
hasDecimal = MAX(STRPOS(subdata, '.')) ge 0
; The column is treated as numeric only if pp_isnumber accepts every tested value.
isnumber=total(pp_isnumber(subdata,nan=nan,infinity=infinity,integer=integer,trim=trim,blank=blank)) eq n_elements(subdata)
; NOTE(review): this continue (and the one further down) bypasses the
; "ON_IOERROR, null" at skip1; the handler is re-armed on the next iteration,
; but stays set to skip1 if this was the last column - confirm this is harmless.
if ~isnumber then continue
if (hasDecimal || ~ARRAY_EQUAL(tmpLong64, tmpDouble)) then begin
; Double
types[j] = 5
endif else begin
; CR61359: Make sure that our integer data doesn't have any
; non-numeric characters. If so, then just return strings instead.
newLen = STRLEN(STRTRIM(tmpLong64,2))
origLen = STRLEN(subdata)
; Null strings will have been converted to the number zero.
; Set their length back to 0.
newLen[WHERE(origLen eq 0, /NULL)] = 0
; Keep strings if any value does not round-trip to the same length, or if
; every tested value is empty.
if (~ARRAY_EQUAL(newLen, origLen) || ARRAY_EQUAL(newLen,0)) then continue
; Long or Long64
; Type code 3 if all values fit in a Long, otherwise 14.
types[j] = ARRAY_EQUAL(tmpLong, tmpLong64) ? 3 : 14
endelse
skip1:
ON_IOERROR, null
endfor
endif
; Attempt to determine if the first line is a header line.
; Only columns with a non-zero (non-string) type code are tested; the first
; conversion failure marks the first line as text and ends the search.
isFirstLineText = 0
for j=0,nColumns-1 do begin
if (types[j] ne 0) then begin
ON_IOERROR, skip2
; If we fail to convert the first item to the type for that column,
; then assume that it is a "string" column header.
result = FIX((*pData[j])[0], TYPE=types[j])
continue
skip2:
ON_IOERROR, null
isFirstLineText = 1
break
endif
endfor
nheader = isFirstLineText ? 1 : 0
; Regenerate the default field names for the non-empty columns only.
fieldNames = Read_CSV_Fieldnames(nColumns)
if (nheader gt 0) then begin
; Move the header row out of the data and into HEADER.
count -= nheader
header = STRARR(nColumns, nheader)
for j=0,nColumns-1 do begin
header[j,*] = (*pData[j])[0:nheader-1]
endfor
endif else begin
; If NUM_RECORDS was specified, we needed to read one extra record,
; in case there was a header. Since there was no header, get rid
; of the extra record.
if (N_Elements(numRecordsIn)) then count--
endelse
; Missing values are only substituted for a single-element, non-zero MISSING_VALUE.
hasMissingValue = N_Elements(missingValue) eq 1 && $
missingValue[0] ne 0
; Do the actual type conversion.
for j=0,nColumns-1 do begin
; Keep only the data rows (drop the header row and any extra record).
*pData[j] = (*pData[j])[nheader:nheader+count-1]
if (types[j] ne 0) then begin
; Locate empty strings before conversion, so they can be replaced afterwards.
if (hasMissingValue) then begin
iMiss = WHERE(*pData[j] eq '', nmiss)
endif
; A conversion error jumps to skip3, presumably leaving the column as
; strings - TODO confirm.
ON_IOERROR, skip3
; Do the actual type conversion.
*pData[j] = FIX(*pData[j], TYPE=types[j])
if (hasMissingValue && nmiss gt 0) then begin
(*pData[j])[iMiss] = missingValue[0]
endif
skip3:
ON_IOERROR, null
endif
endfor
endif ; count gt 1
; Create the final anonymous structure.
; NOTE(review): when count le 1, fieldNames still has fieldCount entries while
; pData has nColumns; confirm READ_ASCII_CREATE_STRUCT tolerates the mismatch
; when empty columns were dropped.
data = READ_ASCII_CREATE_STRUCT(fieldNames, pData)
PTR_FREE, pData
return, data
end