ADDED csv-xml/csv-out.impl Index: csv-xml/csv-out.impl ================================================================== --- /dev/null +++ csv-xml/csv-out.impl @@ -0,0 +1,261 @@ +;;;; cvs-out.impl -*- Hen -*- +;;;; Kon Lovett, Jun '17 + +;;;; *** included source file *** + +;;Issues +;; +;;- missing explicit types for exports; too much '*' type + +;; + +(define-constant CRLF-STR "\r\n") +(define-constant LF-STR "\n") +(define-constant CR-STR "\r") ;old MacOS + +(define *system-newline* + (cond-expand + (windows + CRLF-STR ) + (unix + LF-STR ) + (else + LF-STR ) ) ) + +(define-constant +newline-char-default+ #t) ;#t - | #\n | ... +(define-constant +separator-char-default+ #\,) +(define-constant +quote-char-default+ #\") ;#f | #\" | ... +(define-constant +comment-char-default+ #\#) ;#f | #\# | ... +(define-constant +quote-doubling-escapes?-default+ #t) +(define-constant +quote-controls?-default+ #t) +(define-constant +always-quote?-default+ #t) + +#| +(define-constant +sxml-top-symbol+ '|*TOP*|) +(define-constant +sxml-row-element-default+ 'row) +(define-constant +sxml-col-elements-limit-default+ 32) ; arbitrary (see csv.ss) +|# + +;; + +;very loose : newline-char | separator-char | quote-char +;see "csv-xml.scm" +(define csv-writer-spec? alist?) +(define-check+error-type csv-writer-spec) + +(define csv-writer? procedure?) +(define-check+error-type csv-writer) + +;; + +(define *default-writer-spec* (writer-spec-with-defaults '())) + +(define (list->csv ls #!optional (writer-or-out (current-output-port))) + (let ( + (writer + (cond + ((csv-writer? writer-or-out) + writer-or-out ) + ((output-port? writer-or-out) + (make-csv-line-writer 'list->csv writer-or-out *default-writer-spec*) ) + (else + (error 'list->csv "invalid csv-writer or output-port" writer-or-out) ) ) ) ) + (for-each writer ls) ) ) + +#| +;; + +(define (list->sxml ls + #!optional + (row-element (sxml-row-element-default)) + (column-elements (sxml-col-elements-default)) + (writer-spec *default-writer-spec*)) + (append! + `(,(sxml-top-symbol)) + (map (cut list->sxml-element <> row-element column-elements writer-spec) ls)) ) +|# + +;; + +(define (writer-spec + #!key + (newline-char +newline-char-default+) + (separator-char +separator-char-default+) + (quote-char +quote-char-default+) + (comment-char +comment-char-default+) + (quote-doubling-escapes? +quote-doubling-escapes?-default+) + (quote-controls? +quote-controls?-default+) + (always-quote? +always-quote?-default+)) + ;FIXME checking the input types + `((newline-char . ,newline-char) + (separator-char . ,separator-char) + (quote-char . ,quote-char) + (comment-char . ,comment-char) + (quote-doubling-escapes? . ,quote-doubling-escapes?) + (quote-controls? . ,quote-controls?) + (always-quote? . ,always-quote?)) ) + +;; + +(define (make-csv-writer out-or-str #!optional (writer-spec '())) + (let ((make-spec-csv-writer (make-csv-writer-maker writer-spec))) + (make-spec-csv-writer out-or-str) ) ) + +(define (make-csv-writer-maker #!optional (writer-spec '())) + (let ((writer-spec + (writer-spec-with-defaults + (check-csv-writer-spec 'make-csv-writer-maker writer-spec)) ) ) + (lambda (out-or-str) + (let ( + (out + (cond + ((string? out-or-str) + (open-output-file out-or-str) ) + ((output-port? out-or-str) + out-or-str ) + (else + (error + 'csv-writer-maker + "invalid output-port or string" out-or-str) ) ) ) ) + (make-csv-line-writer 'csv-writer-maker out writer-spec) ) ) ) ) + +;; + +(define (make-csv-line-writer loc out writer-spec) + (let ( + (writer-spec + (check-csv-writer-spec loc writer-spec) ) + (newline-obj + (select-newline-object loc (alist-ref 'newline-char writer-spec eq?)) ) + (separator-char + (alist-ref 'separator-char writer-spec eq?) ) + (quote-char + (alist-ref 'quote-char writer-spec eq?) ) + (comment-char + (alist-ref 'comment-char writer-spec eq?) ) + (quote-doubling-escapes? + (alist-ref 'quote-doubling-escapes? writer-spec eq?) ) + (quote-controls? + (alist-ref 'quote-controls? writer-spec eq?) ) + (always-quote? + (alist-ref 'always-quote? writer-spec eq?) ) ) + ; + (let* ( + (quote-char-str (unicode-char->string quote-char) ) + (quote-char-str-2 (string-append quote-char-str quote-char-str)) ) + ; + (define (csv-line-object->string obj) + ; + (define (quote-doubling? str) + (and quote-doubling-escapes? (string-index str quote-char)) ) + ; + (define (quoting? str) + (or + always-quote? + (quote-doubling? str) + (and separator-char (string-index str separator-char)) + (and quote-controls? (string-index str char-set:iso-control))) ) + ; + (type-case obj + ((char) + (csv-line-object->string (unicode-char->string obj)) ) + ((symbol) + (csv-line-object->string (symbol->string obj)) ) + ((string) + (if (and quote-char (quoting? obj)) + (let ( + (str + (if (quote-doubling? obj) + (string-translate* obj `((,quote-char-str . ,quote-char-str-2))) + obj ) ) ) + ; + (conc quote-char str quote-char) ) + obj ) ) + (number + (csv-line-object->string (number->string obj)) ) + (else + (csv-line-object->string (->string obj)) ) ) ) + ; + (lambda (obj) + (let ( + ;build row to output as a string with a line-ending sequence + (lin + ;comment desired? + (if (list? obj) + ;row data + (let ((qstrs (map csv-line-object->string (check-list loc obj)))) + (apply + conc + (append! + (intersperse qstrs separator-char) + `(,newline-obj))) ) + ;are we supposed to do comments? + (if comment-char + (conc comment-char obj newline-obj) + obj + #; + (begin + (warning loc "comments not active" obj writer-spec) + "" ) ) ) ) ) + ; + (display lin out) ) ) ) ) ) + +;; + +(define (select-newline-object loc spec) + (case spec + ((cr) + #\return ) + ((lf) + #\newline ) + ((crlf) + CRLF-STR ) + (else + *system-newline* ) ) ) + +;; + +(define (writer-spec-with-defaults writer-spec) + `((newline-char . ,(alist-ref 'newline-char writer-spec eq? +newline-char-default+)) + (separator-char . ,(alist-ref 'separator-char writer-spec eq? +separator-char-default+)) + (quote-char . ,(alist-ref 'quote-char writer-spec eq? +quote-char-default+)) + (comment-char . ,(alist-ref 'comment-char writer-spec eq? +comment-char-default+)) + (quote-doubling-escapes? . ,(alist-ref 'quote-doubling-escapes? writer-spec eq? +quote-doubling-escapes?-default+)) + (quote-controls? . ,(alist-ref 'quote-controls? writer-spec eq? +quote-controls?-default+)) + (always-quote? . ,(alist-ref 'always-quote? writer-spec eq? +always-quote?-default+))) ) + +#| +;; + +(define (list->sxml-element ls row-element col-elements writer-spec) + (if (list? ls) + ;row data + `(,row-element ,@(map list col-elements (map ->string ls))) + ;are we supposed to do comments? + (if (alist-ref 'comment-char writer-spec eq?) + `(*COMMENT* ,(->string ls)) + ls ) ) ) + +(define (make-sxml-col-symbol n) + (string->symbol (string-append "col-" (number->string n))) ) + +(define +sxml-col-elements-default+ + (map make-sxml-col-symbol (sxml-col-iota)) ) + +(define (sxml-top-symbol) + +sxml-top-symbol+ ) + +(define (sxml-row-element-default) + +sxml-row-element-default+ ) + +(define (sxml-col-elements-default) + +sxml-col-elements-default+ ) + +(define (sxml-col-iota) + (iota +sxml-col-elements-limit-default+) ) +#; +(define (sxml-col-iota) + (do ((i 0 add1) + (ls '() (cons (make-sxml-col-symbol i) ls)) ) + ((= i +sxml-col-elements-limit-default+) ls) ) ) +|# ADDED csv-xml/csv-xml.meta Index: csv-xml/csv-xml.meta ================================================================== --- /dev/null +++ csv-xml/csv-xml.meta @@ -0,0 +1,19 @@ +;;;; csv-xml.meta -*- Hen -*- + +((egg "csv-xml.egg") + (date "2011-07-02") + (category parsing) + (author "Neil van Dyke") + (license "LGPL 3") + (doc-from-wiki) + (synopsis "Parsing comma-separated values") + (depends + (setup-helper "1.5.2") + (check-errors "2.0.2") + (moremacros "1.4.2") + (string-utils "1.5.5")) + (test-depends testeez test) + (files + "csv-xml.meta" "csv-xml.setup" + "csv-xml.scm" "csv-out.impl" "csv.ss" + "test/run.scm" "test/test-csv.ss") ) ADDED csv-xml/csv-xml.scm Index: csv-xml/csv-xml.scm ================================================================== --- /dev/null +++ csv-xml/csv-xml.scm @@ -0,0 +1,108 @@ +;;;; csv-xml.scm -*- Hen -*- +;;;; Kon Lovett, Jun '17 +;;;; Kon Lovett, ??? '?? + +(module csv-xml + +(;export + ; + reader-spec + ; + make-csv-reader + make-csv-reader-maker + ; + csv->list + csv->sxml + csv-for-each + csv-map + ; + csv-reader? check-csv-reader error-csv-reader + csv-reader-spec? check-csv-reader-spec error-csv-reader-spec + ; + writer-spec + ; + make-csv-writer-maker + make-csv-writer + ; + list->csv + #;list->sxml + ; + csv-writer? check-csv-writer error-csv-writer + csv-writer-spec? check-csv-writer-spec error-csv-writer-spec) + +(import scheme) + +#;(import (except chicken provide)) +(import chicken) + +;;; + +;Need to process `#lang' as well. So just "commented out" the "offending" +;sections in the source. +#;(define-syntax provide (syntax-rules () ((_ ?x0 ...) (begin)))) +(define null '()) + +(include "csv.ss") + +;;; + +(import (only data-structures conc intersperse ->string alist-ref string-translate*)) +(require-library data-structures) + +#;(import (only list-utils alist?)) +(import (only (srfi 1) every iota append! map)) +(require-library (srfi 1)) + +(import (only (srfi 13) string-index)) +(require-library (srfi 13)) + +(import (only (srfi 14) char-set:iso-control)) +(require-library (srfi 14)) + +(import (only type-checks define-check+error-type check-string check-list)) +(require-library type-checks) + +(import (only unicode-utils unicode-char->string)) +(require-library unicode-utils) + +(require-extension moremacros) + +;(from list-utils egg) +(define (alist? obj) + (if (pair? obj) + (every pair? obj) + (null? obj) ) ) + +;very loose ... +(define csv-reader-spec? alist?) +(define-check+error-type csv-reader-spec) + +(define csv-reader? procedure?) +(define-check+error-type csv-reader) + +(define (reader-spec + #!key + (newline-type 'lax) + (separator-chars '(#\,)) + (quote-char #\") + (quote-doubling-escapes? #t) + (comment-chars '()) + (whitespace-chars '(#\space)) + (strip-leading-whitespace? #f) + (strip-trailing-whitespace? #f) + (newlines-in-quotes? #t)) + `((newline-type . ,newline-type) + (separator-chars . ,separator-chars) + (quote-char . ,quote-char) + (quote-doubling-escapes? . ,quote-doubling-escapes?) + (comment-chars . ,comment-chars) + (whitespace-chars . ,whitespace-chars) + (strip-leading-whitespace? . ,strip-leading-whitespace?) + (strip-trailing-whitespace? . ,strip-trailing-whitespace?) + (newlines-in-quotes? . ,newlines-in-quotes?)) ) + +;;; + +(include "csv-out.impl") + +) ;csv-xml ADDED csv-xml/csv-xml.setup Index: csv-xml/csv-xml.setup ================================================================== --- /dev/null +++ csv-xml/csv-xml.setup @@ -0,0 +1,12 @@ +;;;; csv-xml.setup -*- Hen -*- + +(use setup-helper-mod) + +(verify-extension-name "csv-xml") + +(setup-shared+static-extension-module (extension-name) (extension-version "0.12.1") + #:types? #t + #:inline? #t + #:compile-options '( + -optimize-level 3 -debug-level 2 + -no-procedure-checks-for-toplevel-bindings -no-procedure-checks-for-usual-bindings)) ADDED csv-xml/csv.ss Index: csv-xml/csv.ss ================================================================== --- /dev/null +++ csv-xml/csv.ss @@ -0,0 +1,969 @@ +;;; @Package csv +;;; @Subtitle Comma-Separated Value (CSV) Utilities in Scheme +;;; @HomePage http://www.neilvandyke.org/csv-scheme/ +;;; @Author Neil Van Dyke +;;; @Version 0.10 +;;; @Date 2010-04-13 +;;; @PLaneT neil/csv:1:6 + +;; $Id: csv.ss,v 1.199 2010/04/13 17:56:20 neilpair Exp $ + +;;; @legal +;;; Copyright @copyright{} 2004--2009 Neil Van Dyke. This program is Free +;;; Software; you can redistribute it and/or modify it under the terms of the +;;; GNU Lesser General Public License as published by the Free Software +;;; Foundation; either version 3 of the License (LGPL 3), or (at your option) +;;; any later version. This program is distributed in the hope that it will be +;;; useful, but without any warranty; without even the implied warranty of +;;; merchantability or fitness for a particular purpose. See +;;; @indicateurl{http://www.gnu.org/licenses/} for details. For other licenses +;;; and consulting, please contact the author. +;;; @end legal + +;#lang scheme/base + +;;; @section Introduction + +;;; The @b{csv} Scheme library provides utilities for reading various kinds of +;;; what are commonly known as ``comma-separated value'' (CSV) files. Since +;;; there is no standard CSV format, this library permits CSV readers to be +;;; constructed from a specification of the peculiarities of a given variant. +;;; A default reader handles the majority of formats. +;;; +;;; One of the main uses of this library is to import data from old crusty +;;; legacy applications into Scheme for data conversion and other processing. +;;; To that end, this library includes various conveniences for iterating over +;;; parsed CSV rows, and for converting CSV input to the +;;; @uref{http://pobox.com/~oleg/ftp/Scheme/SXML.html, SXML 3.0} Scheme XML +;;; format. +;;; +;;; This library requires R5RS, SRFI-6, SRFI-23, and an @code{integer->char} +;;; procedure that accepts ASCII values. +;;; +;;; Other implementations of some kind of CSV reading for Scheme include +;;; Gauche's @code{text.csv} module, and Scsh's @code{record-reader} and +;;; related procedures. This library intends to be portable and more +;;; comprehensive. + +;; TODO: Briefly introduce terms "row", "column", and "field". + +(define-syntax %csv:error + (syntax-rules () ((_ p m o) + (error (string-append p " : " m) o) + ;; Bigloo: (error p m o) + ))) + +(define-syntax %csv:type-error + (syntax-rules () + ((_ proc-str expected-str got-value) + (%csv:error proc-str + (string-append "expected " expected-str ", received:") + got-value)))) + +(define %csv:a2c integer->char) + +(define %csv:cr (%csv:a2c 13)) +(define %csv:lf (%csv:a2c 10)) + +(define-syntax %csv:gosc + (syntax-rules () + ((_ os-stx) + (let* ((os os-stx) + (str (get-output-string os))) + (close-output-port os) + str)))) + +(define (%csv:in-arg proc-name in) + (cond ((input-port? in) in) + ((string? in) (open-input-string in)) + (else (%csv:type-error proc-name "input port or string" in)))) + +(define (%csv:reader-or-in-arg proc-name reader-or-in) + (cond ((procedure? reader-or-in) reader-or-in) + ((input-port? reader-or-in) (make-csv-reader reader-or-in)) + ((string? reader-or-in) (make-csv-reader (open-input-string + reader-or-in))) + (else (%csv:type-error proc-name + "csv reader or input port or string" + reader-or-in)))) + +;;; @section Reader Specs + +;;; CSV readers are constructed using @dfn{reader specs}, which are sets of +;;; attribute-value pairs, represented in Scheme as association lists keyed on +;;; symbols. Each attribute has a default value if not specified otherwise. +;;; The attributes are: + +;;; @table @code +;;; +;;; @item newline-type +;;; Symbol representing the newline, or record-terminator, convention. The +;;; convention can be a fixed character sequence (@code{lf}, @code{crlf}, or +;;; @code{cr}, corresponding to combinations of line-feed and carriage-return), +;;; any string of one or more line-feed and carriage-return characters +;;; (@code{lax}), or adaptive (@code{adapt}). @code{adapt} attempts to detect +;;; the newline convention at the start of the input and assume that convention +;;; for the remainder of the input. Default: @code{lax} +;;; +;;; @item separator-chars +;;; Non-null list of characters that serve as field separators. Normally, this +;;; will be a list of one character. Default: @code{(#\,)} (list of the comma +;;; character) +;;; +;;; @item quote-char +;;; Character that should be treated as the quoted field delimiter character, +;;; or @code{#f} if fields cannot be quoted. Note that there can be only one +;;; quote character. Default: @code{#\"} (double-quote) +;;; +;;; @item quote-doubling-escapes? +;;; Boolean for whether or not a sequence of two @code{quote-char} quote +;;; characters within a quoted field constitute an escape sequence for +;;; including a single @code{quote-char} within the string. Default: @code{#t} +;;; +;;; @item comment-chars +;;; List of characters, possibly null, which comment out the entire line of +;;; input when they appear as the first character in a line. Default: +;;; @code{()} (null list) +;;; +;;; @item whitespace-chars +;;; List of characters, possibly null, that are considered @dfn{whitespace} +;;; constituents for purposes of the @code{strip-leading-whitespace?} and +;;; @code{strip-trailing-whitespace?} attributes described below. +;;; Default: @code{(#\space)} (list of the space character) +;;; +;;; @item strip-leading-whitespace? +;;; Boolean for whether or not leading whitespace in fields should be +;;; stripped. Note that whitespace within a quoted field is never stripped. +;;; Default: @code{#f} +;;; +;;; @item strip-trailing-whitespace? +;;; Boolean for whether or not trailing whitespace in fields should be +;;; stripped. Note that whitespace within a quoted field is never stripped. +;;; Default: @code{#f} +;;; +;;; @item newlines-in-quotes? +;;; Boolean for whether or not newline sequences are permitted within quoted +;;; fields. If true, then the newline characters are included as part of the +;;; field value; if false, then the newline sequence is treated as a premature +;;; record termination. Default: @code{#t} +;;; +;;; @end table + +;; TODO: Do not expose this procedure for now. We expect it to go away and be +;; replaced with two other procedures. +;; +;; @defproc %csv:csv-spec-derive orig-spec changes +;; +;; Yields a new CSV spec that is derived from @var{orig-spec} by applying spec +;; @var{changes} as attribute substitions and additions to the original. For +;; example, given an original CSV reader spec: +;; +;; @lisp +;; (define my-first-csv-spec +;; '((newline-type . lax) +;; (separator-chars . (#\,)) +;; (quote-char . #\") +;; (quote-doubling-escapes? . #t) +;; (whitespace-chars . (#\space)))) +;; @end lisp +;; +;; a derived spec with a different @code{separator-chars} attribute and an +;; added @code{comment-chars} attribute can be created like: +;; +;; @lisp +;; (%csv:csv-spec-derive my-first-csv-spec +;; '((separator-chars . (#\%)) +;; (comment-chars . (#\#)))) +;; @result{} +;; ((separator-chars . (#\%)) +;; (comment-chars . (#\#)) +;; (newline-type . lax) +;; (quote-char . #\") +;; (quote-doubling-escapes? . #t) +;; (whitespace-chars . (#\space))) +;; @end lisp +;; +;; In that the yielded spec might share some structure with @var{orig-spec} +;; and/or @var{changes}. Most applications will not use this procedure +;; directly. + +(define (%csv:csv-spec-derive orig-spec changes) + ;; TODO: Make this not share structure. Error-check and normalize at the + ;; same time we clone. + (let ((new-spec '())) + (let ((add-to-new-spec + (lambda (alist) + (for-each (lambda (cell) + (or (assq (car cell) new-spec) + (set! new-spec (cons cell new-spec)))) + alist)))) + (add-to-new-spec changes) + (add-to-new-spec orig-spec) + (reverse new-spec)))) + +;;; @section Making Reader Makers + +;;; CSV readers are procedures that are constructed dynamically to close over a +;;; particular CSV input and yield a parsed row value each time the procedure +;;; is applied. For efficiency reasons, the reader procedures are themselves +;;; constructed by another procedure, @code{make-csv-reader-maker}, for +;;; particular CSV reader specs. + +(define (%csv:csv-error code extra) + ;; TODO: Maybe make the CSV error handler user-specifiable, or allow user to + ;; specify some errors that should be disregarded. + ;; + ;; TODO: Add position information. Keep track of character position while + ;; reading. + (%csv:error + "[csv-reader]" + (string-append "Erroneous CSV format: " + (case code + ((junk-after-quote-close) + "Junk after close of quoted field:") + (else (string-append "INTERNAL ERROR: Unknown code: " + (symbol->string code))))) + extra)) + +(define (%csv:newline-check-step0 newline-type c port) + ;; (display "*DEBUG* (equal? newline-type 'lax) = ") + ;; (write (equal? newline-type 'lax)) + ;; (newline) + ;; (display "*DEBUG* (eqv? newline-type 'lax) = ") + ;; (write (eqv? newline-type 'lax)) + ;; (newline) + (case newline-type + ((cr) (eqv? c %csv:cr)) + ((lf) (eqv? c %csv:lf)) + ((crlf) (if (eqv? c %csv:cr) + (let ((c2 (peek-char port))) + (cond ((eof-object? c2) + ;; Note: This is a CR-EOF in an input that uses CR-LF + ;; for terminating records. We are discarding the CR, + ;; so it will not be added to the field string. We + ;; might want to signal an error. + #t) + ((eqv? c2 %csv:lf) + (read-char port) + #t) + (else #f))) + #f)) + ((lax detect) (cond ((eqv? c %csv:cr) + (let ((c2 (peek-char port))) + (cond ((eof-object? c2) #t) + ((eqv? c2 %csv:lf) + (read-char port) + 'crlf) + (else 'cr)))) + ((eqv? c %csv:lf) 'lf) + (else #f))) + (else (%csv:error + "%csv:make-portreader/positional" + "unrecognized newline-type" + newline-type)))) + +(define %csv:make-portreader/positional + (letrec-syntax + ((newline-check + (syntax-rules () + ((_ newline-type c port detected-newline-type) + ;; Note: "port" and "detected-newline-type" must be identifiers. + ;; "newline-type" and "c" must be identifiers or self-evals. + (if (eqv? newline-type 'detect) + (begin (set! detected-newline-type + (%csv:newline-check-step0 newline-type c port)) + detected-newline-type) + (%csv:newline-check-step0 newline-type c port))))) + (gosc-cons + ;; Note: This is to ensure the output string is gotten and closed + ;; before consing it with the result of a recursive call. + (syntax-rules () + ((_ os b) (let ((s (%csv:gosc os))) (cons s b)))))) + (lambda (newline-type + separator-chars + quote-char + quote-doubling-escapes? + comment-chars + whitespace-chars + strip-leading-whitespace? + strip-trailing-whitespace? + newlines-in-quotes?) + (lambda (port) + (let ((dnlt #f) + (escape-char #\\)) + (let read-fields-or-eof ((c (read-char port))) + (cond + ((eof-object? c) '()) + ((and strip-leading-whitespace? (memv c whitespace-chars)) + ;; It's leading whitespace char when we're ignoring leading + ;; whitespace in fields, and there might just be whitespace and + ;; then an EOF, which should probably be considered just an EOF + ;; rather than a row with one empty field, so just skip this + ;; whitespace char. + (read-fields-or-eof (read-char port))) + ((and (not (null? comment-chars)) (memv c comment-chars)) + ;; It's a comment char in the first column (or in the first + ;; non-whitespace column, if "strip-leading-whitespace?" is + ;; true), so skip to end of line. + (let ((fake-dnlt #f)) + (let loop ((c (read-char port))) + (cond ((eof-object? c) '()) + ((newline-check newline-type c port fake-dnlt) + (read-fields-or-eof (read-char port))) + (else (loop (read-char port))))))) + (else + ;; It's not going to be just an EOF, so try to read a row. + (let ((row + (let read-fields ((c c)) + (cond + ;; If an EOF or newline in an unquoted field, consider + ;; the field and row finished. (We don't consider EOF + ;; before newline to be an error, although perhaps that + ;; would be a useful check for a freak premature + ;; end-of-input when dealing with "well-formed" CSV). + ((or (eof-object? c) + (newline-check newline-type c port dnlt)) + (list "")) + ;; If a field separator, finish this field and cons + ;; with value of recursive call to get the next field. + ((memv c separator-chars) + (cons "" (read-fields (read-char port)))) + ;; If we're ignoring leading whitespace, and it's a + ;; whitespace-chars character, then recurse to keep + ;; finding the field start. + ((and strip-leading-whitespace? + (memv c whitespace-chars)) + (read-fields (read-char port))) + ;; If a quote, read a quoted field. + ((and quote-char (eqv? c quote-char)) + (let ((os (open-output-string))) + (let loop ((c (read-char port))) + (cond + ((or (eof-object? c) + (and (not newlines-in-quotes?) + (newline-check newline-type + c port dnlt))) + (list (%csv:gosc os))) + ((and escape-char (eqv? c escape-char)) + ;FIXME can become unsynchronized + (write-char (read-char port) os) + (loop (read-char port))) + ((and quote-char (eqv? c quote-char)) + (if quote-doubling-escapes? + (let ((c2 (read-char port))) + (if (eqv? c2 quote-char) + (begin (write-char c2 os) + (loop (read-char port))) + (gosc-cons + os + (let skip-after ((c c2)) + (cond + ((or (eof-object? c) + (newline-check + newline-type c port dnlt)) + '()) + ((memv c separator-chars) + (read-fields (read-char port))) + ((memv c whitespace-chars) + ;; Note: We tolerate + ;; whitespace after field + ;; close quote even if + ;; skip-trailing-whitespace? + ;; is false. + (skip-after (read-char port))) + (else (%csv:csv-error + 'junk-after-quote-close + c))))))) + (gosc-cons os + (read-fields (read-char port))))) + (else (write-char c os) + (loop (read-char port))))))) + ;; It's the start of an unquoted field. + (else + (let ((os (open-output-string))) + (write-char c os) + (let loop ((c (read-char port))) + (cond + ((or (eof-object? c) + (newline-check newline-type c port dnlt)) + (list (get-output-string os))) + ((memv c separator-chars) + (gosc-cons os (read-fields (read-char port)))) + ((and strip-trailing-whitespace? + (memv c whitespace-chars)) + ;; TODO: Maybe optimize to avoid creating a new + ;; output string every time we see whitespace. + ;; We could use a string collector with unwrite. + ;; And/or do lookahead to see whether whitespace + ;; is only one character. Do this after we have + ;; a better regression test suite. + (let ((ws-os (open-output-string))) + (write-char c ws-os) + (let ws-loop ((c (read-char port))) + (cond + ((or (eof-object? c) + (newline-check + newline-type c port dnlt)) + (close-output-port ws-os) + (list (%csv:gosc os))) + ((memv c separator-chars) + (close-output-port ws-os) + (gosc-cons os (read-fields (read-char + port)))) + ((memv c whitespace-chars) + (write-char c ws-os) + (ws-loop (read-char port))) + (else + (display (%csv:gosc ws-os) os) + (write-char c os) + (loop (read-char port))))))) + (else (write-char c os) + (loop (read-char port))))))))))) + (if (null? row) + row + (if (eq? newline-type 'detect) + (cons dnlt row) + row))))))))))) + +(define %csv:make-portreader + ;; TODO: Make a macro for the three times we list the spec attributes. + (letrec ((pb (lambda (x) (if x #t #f))) + (pc (lambda (x) + (cond ((char? x) x) + ((string? x) (case (string-length x) + ((1) (string-ref x 0)) + (else (%csv:type-error + "make-csv-reader-maker" + "character" + x)))) + (else (%csv:type-error "make-csv-reader-maker" + "character" + x))))) + (pc-f (lambda (x) + (cond ((not x) x) + ((char? x) x) + ((string? x) (case (string-length x) + ((0) #f) + ((1) (string-ref x 0)) + (else (%csv:type-error + "make-csv-reader-maker" + "character or #f" + x)))) + (else (%csv:type-error "make-csv-reader-maker" + "character or #f" + x))))) + (pe (lambda (x acceptable) + (if (memq x acceptable) + x + (%csv:type-error + "make-csv-reader-maker" + (let ((os (open-output-string))) + (display "symbol from the set " os) + (write acceptable os) + (%csv:gosc os)) + x)))) + (plc-n (lambda (x) + (or (list? x) + (%csv:type-error "make-csv-reader-maker" + "list of characters" + x)) + (map pc x))) + (plc (lambda (x) + (let ((result (plc-n x))) + (if (null? result) + (%csv:type-error "make-csv-reader-maker" + "non-null list of characters" + x) + result))))) + (lambda (reader-spec) + (let ((newline-type 'lax) + (separator-chars '(#\,)) + (quote-char #\") + (quote-doubling-escapes? #t) + (comment-chars '()) + (whitespace-chars '(#\space)) + (strip-leading-whitespace? #f) + (strip-trailing-whitespace? #f) + (newlines-in-quotes? #t)) + ;; TODO: It's erroneous to have two entries for the same attribute in a + ;; spec. However, it would be nice if we error-detected duplicate + ;; entries, or at least had assq semantics (first, rather than last, + ;; wins). Use csv-spec-derive's descendants for that. + (for-each + (lambda (item) + (let ((v (cdr item))) + (case (car item) + ((newline-type) + (set! newline-type (pe v '(cr crlf detect lax lf)))) + ((separator-chars) + (set! separator-chars (plc v))) + ((quote-char) + (set! quote-char (pc-f v))) + ((quote-doubling-escapes?) + (set! quote-doubling-escapes? (pb v))) + ((comment-chars) + (set! comment-chars (plc-n v))) + ((whitespace-chars) + (set! whitespace-chars (plc-n v))) + ((strip-leading-whitespace?) + (set! strip-leading-whitespace? (pb v))) + ((strip-trailing-whitespace?) + (set! strip-trailing-whitespace? (pb v))) + ((newlines-in-quotes?) + (set! newlines-in-quotes? (pb v)))))) + reader-spec) + (%csv:make-portreader/positional + newline-type + separator-chars + quote-char + quote-doubling-escapes? + comment-chars + whitespace-chars + strip-leading-whitespace? + strip-trailing-whitespace? + newlines-in-quotes?))))) + +;;; @defproc make-csv-reader-maker reader-spec +;;; +;;; Constructs a CSV reader constructor procedure from the @var{reader-spec}, +;;; with unspecified attributes having their default values. +;;; +;;; For example, given the input file @code{fruits.csv} with the content: +;;; +;;; @example +;;; apples | 2 | 0.42 +;;; bananas | 20 | 13.69 +;;; @end example +;;; +;;; a reader for the file's apparent format can be constructed like: +;;; +;;; @lisp +;;; (define make-food-csv-reader +;;; (make-csv-reader-maker +;;; '((separator-chars . (#\|)) +;;; (strip-leading-whitespace? . #t) +;;; (strip-trailing-whitespace? . #t)))) +;;; @end lisp +;;; +;;; The resulting @code{make-food-csv-reader} procedure accepts one argument, +;;; which is either an input port from which to read, or a string from which to +;;; read. Our example input file then can be be read by opening an input port +;;; on a file and using our new procedure to construct a reader on it: +;;; +;;; @lisp +;;; (define next-row +;;; (make-food-csv-reader (open-input-file "fruits.csv"))) +;;; @end lisp +;;; +;;; This reader, @code{next-row}, can then be called repeatedly to yield a +;;; parsed representation of each subsequent row. The parsed format is a list +;;; of strings, one string for each column. The null list is yielded to +;;; indicate that all rows have already been yielded. +;;; +;;; @lisp +;;; (next-row) @result{} ("apples" "2" "0.42") +;;; (next-row) @result{} ("bananas" "20" "13.69") +;;; (next-row) @result{} () +;;; @end lisp + +(define (make-csv-reader-maker reader-spec) + (let ((make-portread + (if (let ((p (assq 'newline-type reader-spec))) (and p (cdr p))) + ;; Newline-adapting portreader-maker. + (letrec + ((detect-portread + (%csv:make-portreader + (%csv:csv-spec-derive reader-spec + '((newline-type . detect))))) + ;; TODO: The set of cr/crlf/lf newline-type portreaders are + ;; constructed optimistically right now for two reasons: + ;; 1. we don't yet sanitize reader-specs of shared structure + ;; that can be mutated behind our backs; 2. eventually, we + ;; want to add a "lots-o-shots?" argument that, when true, + ;; would do this anyway. Consider. + (cr-portread + (%csv:make-portreader + (%csv:csv-spec-derive reader-spec + '((newline-type . cr))))) + (crlf-portread + (%csv:make-portreader + (%csv:csv-spec-derive reader-spec + '((newline-type . crlf))))) + (lf-portread + (%csv:make-portreader + (%csv:csv-spec-derive reader-spec + '((newline-type . lf)))))) + (lambda () + (let ((actual-portread #f)) + (let ((adapt-portread + (lambda (port) + (let ((dnlt-row (detect-portread port))) + (if (null? dnlt-row) + dnlt-row + (begin (set! actual-portread + (case (car dnlt-row) + ((cr) cr-portread) + ((crlf) crlf-portread) + ((lf) lf-portread) + (else actual-portread))) + (cdr dnlt-row))))))) + (set! actual-portread adapt-portread) + (lambda (port) (actual-portread port)))))) + ;; Stateless portreader-maker. + (let ((reusable-portread + (%csv:make-portreader reader-spec))) + (lambda () reusable-portread))))) + (lambda (in) + (let ((port (%csv:in-arg "[csv-reader]" in)) + (portread (make-portread))) + (lambda () (portread port)))))) + +;;; @section Making Readers + +;;; In addition to being constructed from the result of +;;; @code{make-csv-reader-maker}, CSV readers can also be constructed using +;;; @code{make-csv-reader}. + +;;; @defproc make-csv-reader in [reader-spec] +;;; +;;; Construct a CSV reader on the input @var{in}, which is an input port or a +;;; string. If @var{reader-spec} is given, and is not the null list, then a +;;; ``one-shot'' reader constructor is constructed with that spec and used. If +;;; @var{reader-spec} is not given, or is the null list, then the default CSV +;;; reader constructor is used. For example, the reader from the +;;; @code{make-csv-reader-maker} example could alternatively have been +;;; constructed like: +;;; +;;; @lisp +;;; (define next-row +;;; (make-csv-reader +;;; (open-input-file "fruits.csv") +;;; '((separator-chars . (#\|)) +;;; (strip-leading-whitespace? . #t) +;;; (strip-trailing-whitespace? . #t)))) +;;; @end lisp + +(define make-csv-reader + (let ((default-maker (make-csv-reader-maker '()))) + (lambda (in . rest) + (let ((spec (cond ((null? rest) '()) + ((null? (cdr rest)) (car rest)) + (else (%csv:error "make-csv-reader" + "extraneous arguments" + (cdr rest)))))) + ((if (null? spec) + default-maker + (make-csv-reader-maker spec)) + (%csv:in-arg "make-csv-reader" in)))))) + +;;; @section High-Level Conveniences + +;;; Several convenience procedures are provided for iterating over the CSV rows +;;; and for converting the CSV to a list. +;;; +;;; To the dismay of some Scheme purists, each of these procedures accepts a +;;; @var{reader-or-in} argument, which can be a CSV reader, an input port, or a +;;; string. If not a CSV reader, then the default reader constructor is used. +;;; For example, all three of the following are equivalent: +;;; +;;; @lisp +;;; (csv->list STRING ) +;;; @equiv{} +;;; (csv->list (make-csv-reader STRING )) +;;; @equiv{} +;;; (csv->list (make-csv-reader (open-input-string STRING ))) +;;; @end lisp + +;;; @defproc csv-for-each proc reader-or-in +;;; +;;; Similar to Scheme's @code{for-each}, applies @var{proc}, a procedure of one +;;; argument, to each parsed CSV row in series. @var{reader-or-in} is the CSV +;;; reader, input port, or string. The return value is undefined. + +;; TODO: Doc an example for this. + +(define (csv-for-each proc reader-or-in) + (let ((reader (%csv:reader-or-in-arg "csv-for-each" reader-or-in))) + (let loop ((row (reader))) + (or (null? row) + (begin (proc row) + (loop (reader))))))) + +;;; @defproc csv-map proc reader-or-in +;;; +;;; Similar to Scheme's @code{map}, applies @var{proc}, a procedure of one +;;; argument, to each parsed CSV row in series, and yields a list of the values +;;; of each application of @var{proc}, in order. @var{reader-or-in} is the CSV +;;; reader, input port, or string. + +;; TODO: Doc an example for this. + +;; (define (csv-map proc reader-or-in) +;; (let ((reader (%csv:reader-or-in-arg "csv-for-each" reader-or-in))) +;; (let ((head '())) +;; (let ((row (reader))) +;; (if (null? row) +;; head +;; (let ((pair (cons (proc row) '()))) +;; (set! head pair) +;; (let loop ((prior pair)) +;; (let ((row (reader))) +;; (if (null? row) +;; head +;; (let ((pair (cons (proc row) '()))) +;; (set-cdr! prior pair) +;; (loop pair))))))))))) + +(define (csv-map proc reader-or-in) + (let ((reader (%csv:reader-or-in-arg "csv-for-each" reader-or-in))) + (let loop ((row (reader)) (ret null)) + (if (null? row) + (reverse ret) + (let ((ret (cons (proc row) ret))) + (loop (reader) ret)))))) + +;;; @defproc csv->list reader-or-in +;;; +;;; Yields a list of CSV row lists from input @var{reader-or-in}, which is a +;;; CSV reader, input port, or string. + +;; TODO: Doc an example for this. + +;; (define (csv->list reader-or-in) +;; (let ((reader (%csv:reader-or-in-arg "csv->list" reader-or-in))) +;; (let ((head '())) +;; (let ((row (reader))) +;; (if (null? row) +;; head +;; (let ((pair (cons row '()))) +;; (set! head pair) +;; (let loop ((prior pair)) +;; (let ((row (reader))) +;; (if (null? row) +;; head +;; (let ((pair (cons row '()))) +;; (set-cdr! prior pair) +;; (loop pair))))))))))) + +(define (csv->list reader-or-in) + (csv-map values reader-or-in)) + +;;; @section Converting CSV to SXML + +;;; The @code{csv->sxml} procedure can be used to convert CSV to SXML format, +;;; for processing with various XML tools. + +;;; @defproc csv->sxml reader-or-in [row-element [col-elements]] +;;; +;;; Reads CSV from input @var{reader-or-in} (which is a CSV reader, input port, +;;; or string), and yields an SXML representation. If given, @var{row-element} +;;; is a symbol for the XML row element. If @var{row-element} is not given, +;;; the default is the symbol @code{row}. If given @var{col-elements} is a +;;; list of symbols for the XML column elements. If not given, or there are +;;; more columns in a row than given symbols, column element symbols are of the +;;; format @code{col-@var{n}}, where @var{n} is the column number (the first +;;; column being number 0, not 1). +;;; +;;; For example, given a CSV-format file @code{friends.csv} that has the +;;; contents: +;;; +;;; @example +;;; Binoche,Ste. Brune,33-1-2-3 +;;; Posey,Main St.,555-5309 +;;; Ryder,Cellblock 9, +;;; @end example +;;; +;;; with elements not given, the result is: +;;; +;;; @lisp +;;; (csv->sxml (open-input-file "friends.csv")) +;;; @result{} +;;; (*TOP* +;;; (row (col-0 "Binoche") (col-1 "Ste. Brune") (col-2 "33-1-2-3")) +;;; (row (col-0 "Posey") (col-1 "Main St.") (col-2 "555-5309")) +;;; (row (col-0 "Ryder") (col-1 "Cellblock 9") (col-2 ""))) +;;; @end lisp +;;; +;;; With elements given, the result is like: +;;; +;;; @lisp +;;; (csv->sxml (open-input-file "friends.csv") +;;; 'friend +;;; '(name address phone)) +;;; @result{} +;;; (*TOP* (friend (name "Binoche") +;;; (address "Ste. Brune") +;;; (phone "33-1-2-3")) +;;; (friend (name "Posey") +;;; (address "Main St.") +;;; (phone "555-5309")) +;;; (friend (name "Ryder") +;;; (address "Cellblock 9") +;;; (phone ""))) +;;; @end lisp + +(define csv->sxml + (let* ((top-symbol + (string->symbol "*TOP*")) + (make-col-symbol + (lambda (n) + (string->symbol (string-append "col-" (number->string n))))) + (default-col-elements + (let loop ((i 0)) + (if (= i 32) ; arbitrary magic number + '() + (cons (make-col-symbol i) (loop (+ 1 i))))))) + ;; TODO: Have option to error when columns count doesn't match provided + ;; column name list. + (lambda (reader-or-in . rest) + (let ((reader (%csv:reader-or-in-arg "csv->sxml" + reader-or-in)) + (row-element 'row) + (col-elements #f)) + ;; TODO: Maybe use case-lambda. + (or (null? rest) + (begin (set! row-element (car rest)) + (let ((rest (cdr rest))) + (or (null? rest) + (begin (set! col-elements (car rest)) + (let ((rest (cdr rest))) + (or (null? rest) + (%csv:error + "csv->sxml" + "extraneous arguments" + rest)))))))) + ;; TODO: We could clone and grow default-col-elements for the duration + ;; of this procedure. + (cons top-symbol + (csv-map (lambda (row) + (cons row-element + (let loop ((vals row) + (i 0) + (names (or col-elements + default-col-elements))) + (if (null? vals) + '() + (cons (list (if (null? names) + (make-col-symbol i) + (car names)) + (car vals)) + (loop (cdr vals) + (+ 1 i) + (if (null? names) + '() + (cdr names)))))))) + reader)))))) + +;; TODO: Make a define-csv-reader/positional, for great constant-folding. +;; That's part of the reason some things are done the way they are. + +;; TODO: Make a csv-bind, as a newbie convenience for people without advanced +;; match forms, which looks good in examples. This is better than a +;; csv-map/bind and a csv-for-each/bind. +;; +;; (csv-for-each/bind ((column-binding ...) body ...) +;; { (else => closure) | (else body ...) | } +;; input-port +;; [ csv-reader ]) +;; +;; (csv-for-each/bind +;; ((lastname firstname email) +;; ...) +;; (else => (lambda (row) (error "CSV row didn't match pattern" row))) +;; my-input-port +;; my-csv-reader) + +;; TODO: Handle escapes, once we find an actual example or specification of any +;; flavor of escapes in CSV other than quote-doubling inside a quoted field. + +;; TODO: Add a spec attribute for treating adjacent separators as one, or +;; skipping empty fields. This would probably only be used in practice for +;; parsing whitespace-separated input. + +;; TODO: Get access to MS Excel or documentation, and make this correct. +;; +;; (define msexcel-csv-reader-spec +;; '((newline-type . crlf) +;; (separator-chars . (#\,)) +;; (quote-char . #\") +;; (quote-doubling-escapes? . #t) +;; (comment-chars . ()) +;; (whitespace-chars . (#\space)) +;; (strip-leading-whitespace? . #f) +;; (strip-trailing-whitespace? . #f) +;; (newlines-in-quotes? . #t))) + +;; TODO: Maybe put this back in. +;; +;; (define default-csv-reader-spec +;; '((newline-type . lax) +;; (separator-chars . (#\,)) +;; (quote-char . #\") +;; (quote-doubling-escapes? . #t) +;; (comment-chars . ()) +;; (whitespace-chars . (#\space)) +;; (strip-leading-whitespace? . #f) +;; (strip-trailing-whitespace? . #f) +;; (newlines-in-quotes? . #t))) + +;; TODO: Implement CSV writing, after CSV reading is field-tested and polished. + +;; TODO: Call "close-input-port" once eof-object is hit, but make sure we still +;; can return an empty list on subsequent calls to the CSV reader. + +;; TODO: Consider switching back to returning eof-object at the end of input. +;; We originally changed to returning the null list because we might want to +;; synthesize the EOF, and there is no R5RS binding for the eof-object. + +;; TODO: [2005-12-09] In one test, Guile has a stack overflow when parsing a +;; row with 425 columns. Wouldn't hurt to see if we can make things more +;; tail-recursive. + +;;; @unnumberedsec History + +;;; @table @asis +;;; +;;; @item Version 0.10 -- 2010-04-13 -- PLaneT @code{(1 6)} +;;; Documentation fix. +;;; +;;; @item Version 0.9 -- 2009-03-14 -- PLaneT @code{(1 5)} +;;; Documentation fix. +;;; +;;; @item Version 0.8 -- 2009-02-23 -- PLaneT @code{(1 4)} +;;; Documentation changes. +;;; +;;; @item Version 0.7 -- 2009-02-22 -- PLaneT @code{(1 3)} +;;; License is now LGPL 3. Moved to author's new Scheme administration system. +;;; +;;; @item Version 0.6 -- 2008-08-12 -- PLaneT @code{(1 2)} +;;; For PLT 4 compatibility, new versions of @code{csv-map} and +;;; @code{csv->list} that don't use @code{set-cdr!} (courtesy of Doug +;;; Orleans). PLT 4 @code{if} compatibility change. Minor documentation fixes. +;;; +;;; @item Version 0.5 --- 2005-12-09 +;;; Changed a non-R5RS use of @code{letrec} to @code{let*}, caught by Guile and +;;; David Pirotte. +;;; +;;; @item Version 0.4 --- 2005-06-07 +;;; Converted to Testeez. Minor documentation changes. +;;; +;;; @item Version 0.3 --- 2004-07-21 +;;; Minor documentation changes. Test suite now disabled by default. +;;; +;;; @item Version 0.2 --- 2004-06-01 +;;; Work-around for @code{case}-related bug observed in Gauche 0.8 and 0.7.4.2 +;;; that was tickled by @code{csv-internal:make-portreader/positional}. Thanks +;;; to Grzegorz Chrupa@l{}a for reporting. +;;; +;;; @item Version 0.1 --- 2004-05-31 +;;; First release, for testing with real-world input. +;;; +;;; @end table + +#;(provide + csv->list + csv->sxml + csv-for-each + csv-map + make-csv-reader + make-csv-reader-maker)