; docformat = 'idl'
; $Id: //depot/Release/ENVI50_IDL82/idl/idldir/lib/read_csv.pro#2 $
;
; Copyright (c) 2008-2012, Exelis Visual Information Solutions, Inc. All
;       rights reserved. Unauthorized reproduction is prohibited.

;----------------------------------------------------------------------------
; Build the default column names 'field1', 'field2', ... for a table with
; fieldCount columns.
;
; The numeric suffix is zero-padded to the number of digits in fieldCount,
; so every name has the same length (for example 'field01' ... 'field12'
; when fieldCount is 12).
;
; Params:
;   fieldCount - number of names to generate.
;
; Returns:
;   A string array with fieldCount elements.
;
function read_csv_fieldnames, fieldCount

  compile_opt idl2, hidden

  ; Number of characters needed to print the largest suffix.
  nDigits = STRLEN(STRTRIM(STRING(fieldCount), 2))
  widthStr = STRTRIM(STRING(nDigits), 2)

  ; Integer format with width and minimum digit count both equal to
  ; nDigits, e.g. '(i2.2)', which zero-pads the shorter numbers.
  suffixFormat = '(i' + widthStr + '.' + widthStr + ')'

  ; One-based suffixes: 1, 2, ..., fieldCount.
  suffixes = LINDGEN(fieldCount) + 1

  return, 'field' + STRING(suffixes, FORMAT=suffixFormat)

end

;----------------------------------------------------------------------------
;+
; :Description:
;    The READ_CSV function reads data from a "comma-separated value"
;    (comma-delimited) text file into an IDL structure variable.
;
;    This routine handles CSV files consisting of an optional line of column
;    headers, followed by columnar data, with commas separating each field.
;    Each row is assumed to be a new record.
;
;    The READ_CSV routine will automatically return each column (or field)
;    in the correct IDL variable type using the following rules:
;
;       * Long - All data within that column consists of integers,
;         all of which are smaller than the maximum 32-bit integer.
;       * Long64 - All data within that column consists of integers,
;         with at least one greater than the maximum 32-bit integer.
;       * Double - All data within that column consists of numbers, at least
;         one of which has either a decimal point or an exponent.
;       * String - All data which does not fit into one of the above types.
;
;    This routine is written in the IDL language. Its source code can be
;    found in the file read_csv.pro in the lib subdirectory of the IDL
;    distribution.
;
; :Syntax:
;    Result = READ_CSV( Filename
;      [, COUNT=variable] [, HEADER=variable] [, MISSING_VALUE=value]
;      [, NUM_RECORDS=value] [, RECORD_START=value]
;      [, N_TABLE_HEADER=value] [, TABLE_HEADER=variable]
;      )
;
; :Params:
;    Filename
;      A string containing the name of a CSV file to read into an IDL variable.
;
; :Keywords:
;    COUNT
;      Set this keyword equal to a named variable that will contain the
;      number of records read.
;
;    HEADER
;      Set this keyword equal to a named variable that will contain the
;      column headers as a vector of strings. If no header exists,
;      an empty scalar string is returned.
;
;    MISSING_VALUE
;      Set this keyword equal to a value used to replace any missing
;      floating-point or integer data. The default value is 0.
;
;    NUM_RECORDS
;      Set this keyword equal to the number of records to read.
;      The default is to read all records in the file.
;
;    RECORD_START
;      Set this keyword equal to the index of the first record to read.
;      The default is the first record of the file (record 0).
;
;    N_TABLE_HEADER
;      Set this keyword to the number of lines to skip at the beginning of the file,
;      not including the HEADER line. These extra lines may be retrieved by using the TABLE_HEADER keyword.
;
;    TABLE_HEADER
;      Set this keyword to a named variable in which to return an array of strings
;      containing the extra table headers at the beginning of the file, as specified by N_TABLE_HEADER.
;
; :History:
;   Written, CT, VIS, Oct 2008
;   MP, VIS, Oct 2009:  Added keyword NSKIP and SKIP_HEADER
;     (the signature below exposes these as N_TABLE_HEADER and TABLE_HEADER)
;
;-
;----------------------------------------------------------------------------
; READ_CSV_PP_STRINGS
;
; Reads a CSV file into an anonymous structure with one field per non-empty
; column. Returns 0 if the file has zero length or if every field is empty.
;
; In addition to the READ_CSV keywords, this function accepts the following
; parameters, added by Paulo Penteado (http://www.ppenteado.net):
;
;   TYPES
;     IDL type codes to assume for the columns, instead of trying to
;     determine them. A single value is replicated for every column. If the
;     number of elements does not match the number of non-empty columns,
;     the types are detected and the variable is overwritten with the
;     detected codes (0 means the column is left as strings). Only used when
;     more than one record is read.
;
;   NAN, INFINITY, INTEGER, TRIM, BLANK
;     Passed to pp_isnumber when testing whether a column is numeric.
;
;   ROWS_FOR_TESTING
;     Maximum number of rows to use when testing columns for data types.
;     The default is 100. Set to zero to test all rows.
;
; Errors are trapped with CATCH: the file unit and the column pointers are
; released and the error message is re-issued.
;
function read_csv_pp_strings, Filename, $
    COUNT=count, $
    HEADER=header, $
    MISSING_VALUE=missingValue, $
    NUM_RECORDS=numRecordsIn, $
    RECORD_START=recordStart, $
    N_TABLE_HEADER=nTableHeader, $
    TABLE_HEADER=tableHeader, $
    _EXTRA=_extra, $  ; needed for iOpen
    types=types, $  ; type codes to assume for the columns
    nan=nan, infinity=infinity, integer=integer, trim=trim, blank=blank, $  ; passed to pp_isnumber
    rows_for_testing=rows_for_testing  ; max rows used for type detection (0 = all)

  compile_opt idl2, hidden

  ;ON_ERROR, 2         ;Return on error

  ; Clean up whatever has been allocated so far, then re-issue the error.
  CATCH, err
  if (err ne 0) then begin
    CATCH, /CANCEL
    if (N_ELEMENTS(lun) gt 0) then $
      FREE_LUN, lun
    if (MAX(PTR_VALID(pData)) eq 1) then $
      PTR_FREE, pData
    MESSAGE, !ERROR_STATE.msg
  endif

  header = ''

  if (N_PARAMS() eq 0) then $
    MESSAGE, 'Incorrect number of arguments.'

  ; Empty file
  if (FILE_TEST(filename, /ZERO_LENGTH)) then $
    return, 0

  ;Set appropriate dataStart, where dataStart includes column header.
  dataStart = keyword_set(nTableHeader) ? LONG64(nTableHeader) : 0

  ; Read the table-header lines (if any) plus the first line after them.
  OPENR, lun, filename, /GET_LUN
  str = ''
  tableHeader=''
  for i=0L, dataStart do begin
    READF, lun, str
    if i ne dataStart then begin
      pos = stregex(str, '"')
      if pos ne 0 then begin  ; string not enclosed in quotes
        pos = stregex(str, ',+')  ; check for extra commas
        if pos ne -1 then str = strmid(str, 0, pos)
      endif else begin  ; string enclosed in quotes
        pos = stregex(str, '",+')  ; check for extra commas
        if pos ne -1 then str = strmid(str, 1, pos-1) $
        else str = strjoin(strsplit(str, '"', /EXTRACT))
      endelse
      if i eq 0 then tableHeader = str else tableHeader = [tableHeader, str]
    endif
  endfor
  ; Skip blank lines until the first line with content; that line is used
  ; below to count the fields.
  while (STRLEN(STRTRIM(str,2)) eq 0) do begin
    READF, lun, str
  endwhile
  FREE_LUN, lun

  ; We need to count the number of fields.
  ; First remove escaped quote characters, which look like "".
  str = STRJOIN(STRTOK(str, '""', /REGEX, /EXTRACT))
  ; Now remove quoted strings, which might contain bogus commas.
  str = STRJOIN(STRTOK(str,'"[^"]*"', /REGEX, /EXTRACT))
  ; Finally, count the number of commas.
  fieldCount = N_Elements(STRTOK(str, ',', /PRESERVE_NULL))

  fieldNames = Read_CSV_Fieldnames(fieldCount)

  ; Every field is read as a string (type code 7); conversion happens later.
  template = { $
    version: 1.0, $
    dataStart: dataStart, $  ; specified as a keyword below
    delimiter: BYTE(','), $  ; comma-separated
    missingValue: 0, $
    commentSymbol: '', $
    fieldCount: fieldCount, $
    fieldTypes: REPLICATE(7L, fieldCount), $
    fieldNames: fieldNames, $
    fieldLocations: LONARR(fieldCount), $  ; ignored for csv
    fieldGroups: LINDGEN(fieldCount) $  ; ungrouped
    }

  ; Ask for one record more than requested, in case the first one turns
  ; out to be a header line.
  if (N_Elements(numRecordsIn)) then $
    numRecords = numRecordsIn[0] + 1

  data = READ_ASCII(filename, /CSV, $
    COUNT=count, $
    DATA_START=dataStart, $
    NUM_RECORDS=numRecords, $
    RECORD_START=recordStart, $
    TEMPLATE=template)

  if (N_TAGS(data) eq 0) then $
    MESSAGE, 'File "' + filename + '" is not a valid CSV file.', /NONAME

  ; Eliminate empty columns
  columnLen = LONARR(fieldCount)
  firstNonEmptyRow = count - 1
  lastNonEmptyRow = 0L

  for i=0L,fieldCount-1 do begin
    data.(i) = STRTRIM(data.(i), 2)
    lengths = STRLEN(data.(i))
    good = WHERE(lengths gt 0, ngood)
    if (ngood gt 0) then begin
      firstNonEmptyRow = firstNonEmptyRow < MIN(good)
      lastNonEmptyRow = lastNonEmptyRow > MAX(good)
      columnLen[i] = MAX(lengths)
    endif
  endfor

  nColumns = LONG(TOTAL(columnLen gt 0))

  ; All of the fields were empty.
  if (nColumns eq 0) then begin
    return, 0
  endif

  count = lastNonEmptyRow - firstNonEmptyRow + 1

  ; Convert each field to a pointer, for easier manipulation.
  j = 0L
  pData = PTRARR(nColumns)
  for i=0L,fieldCount-1 do begin
    if (columnLen[i] eq 0) then continue
    columnLen[j] = columnLen[i]
    pData[j] = PTR_NEW((data.(i))[firstNonEmptyRow:lastNonEmptyRow])
    j++
  endfor
  data = 0
  columnLen = columnLen[0:nColumns-1]

  ; Name only the columns that were kept. This is done before the
  ; "count gt 1" branch so that the single-record path also passes a name
  ; list whose length matches pData.
  fieldNames = Read_CSV_Fieldnames(nColumns)

  if (count gt 1) then begin

    ; Attempt to determine the data types for each field.
    if (n_elements(types) eq 1) then types=replicate(types,nColumns)
    if (n_elements(types) ne nColumns) then begin
      types = BYTARR(nColumns)
      rowlimit=n_elements(rows_for_testing) eq 1 ? rows_for_testing : 100
      if (rowlimit eq 0) then rowlimit=count-1
      for j=0,nColumns-1 do begin
        ; Row 0 is excluded because it may be a header line.
        subdata = (*pData[j])[1:(rowlimit < (count-1))]
        ; A failed conversion jumps to skip1, leaving types[j] at 0 (string).
        ON_IOERROR, skip1
        tmpDouble = DOUBLE(subdata)
        tmpLong64 = LONG64(subdata)
        tmpLong = LONG(subdata)
        hasDecimal = MAX(STRPOS(subdata, '.')) ge 0
        isnumber=total(pp_isnumber(subdata,nan=nan,infinity=infinity,integer=integer,trim=trim,blank=blank)) eq n_elements(subdata)
        if ~isnumber then continue
        if (hasDecimal || ~ARRAY_EQUAL(tmpLong64, tmpDouble)) then begin
          ; Double
          types[j] = 5
        endif else begin
          ; CR61359: Make sure that our integer data doesn't have any
          ; non-numeric characters. If so, then just return strings instead.
          newLen = STRLEN(STRTRIM(tmpLong64,2))
          origLen = STRLEN(subdata)
          ; Null strings will have been converted to the number zero.
          ; Set their length back to 0.
          newLen[WHERE(origLen eq 0, /NULL)] = 0
          if (~ARRAY_EQUAL(newLen, origLen) || ARRAY_EQUAL(newLen,0)) then continue
          ; Long or Long64
          types[j] = ARRAY_EQUAL(tmpLong, tmpLong64) ? 3 : 14
        endelse
        skip1:
        ON_IOERROR, null
      endfor
    endif

    ; Attempt to determine if the first line is a header line.
    isFirstLineText = 0
    for j=0,nColumns-1 do begin
      if (types[j] ne 0) then begin
        ON_IOERROR, skip2
        ; If we fail to convert the first item to the type for that column,
        ; then assume that it is a "string" column header.
        result = FIX((*pData[j])[0], TYPE=types[j])
        continue
        skip2:
        ON_IOERROR, null
        isFirstLineText = 1
        break
      endif
    endfor

    nheader = isFirstLineText ? 1 : 0

    if (nheader gt 0) then begin
      count -= nheader
      header = STRARR(nColumns, nheader)
      for j=0,nColumns-1 do begin
        header[j,*] = (*pData[j])[0:nheader-1]
      endfor
    endif else begin
      ; If NUM_RECORDS was specified, we asked for one extra record, in
      ; case there was a header. Since there was no header, get rid of the
      ; extra record -- but only if it is really there. When fewer rows
      ; remain than were requested (short file, or trailing blank rows
      ; trimmed above), every remaining row is real data and an
      ; unconditional decrement would drop the last record.
      if (N_Elements(numRecords) gt 0 && count ge numRecords) then count--
    endelse

    hasMissingValue = N_Elements(missingValue) eq 1 && $
      missingValue[0] ne 0

    ; Do the actual type conversion.
    for j=0,nColumns-1 do begin
      ; Drop the header row (if any) and any extra trailing record.
      *pData[j] = (*pData[j])[nheader:nheader+count-1]
      if (types[j] ne 0) then begin
        ; Remember which entries were empty before they become numbers.
        if (hasMissingValue) then begin
          iMiss = WHERE(*pData[j] eq '', nmiss)
        endif
        ; If the conversion fails, the column is left as strings.
        ON_IOERROR, skip3
        *pData[j] = FIX(*pData[j], TYPE=types[j])
        if (hasMissingValue && nmiss gt 0) then begin
          (*pData[j])[iMiss] = missingValue[0]
        endif
        skip3:
        ON_IOERROR, null
      endif
    endfor

  endif   ; count gt 1

  ; Create the final anonymous structure.
  data = READ_ASCII_CREATE_STRUCT(fieldNames, pData)

  PTR_FREE, pData

  return, data
end