|
20 | 20 | #include "minsky.h" |
21 | 21 | #include "CSVParser.h" |
22 | 22 |
|
| 23 | +#include "CSVTools.rcd" |
23 | 24 | #include "CSVParser.rcd" |
24 | 25 | #include "dataSpecSchema.rcd" |
25 | 26 | #include "dimension.rcd" |
|
36 | 37 |
|
37 | 38 | using namespace minsky; |
38 | 39 | using namespace std; |
| 40 | +using ravel::Parser; |
| 41 | +using ravel::SpaceSeparatorParser; |
| 42 | +using ravel::getWholeLine; |
39 | 43 |
|
40 | 44 | #include <boost/type_traits.hpp> |
41 | 45 | #include <boost/tokenizer.hpp> |
42 | 46 | #include <boost/token_functions.hpp> |
43 | 47 | #include <boost/pool/pool.hpp> |
44 | 48 |
|
namespace escapedListSeparator
{
  /// Tokenizer function object splitting a character sequence into
  /// fields separated by any of the separator characters, honouring
  /// quote and escape characters.
  ///
  /// Pinched from boost::escaped_list_separator, and modified to not
  /// throw: a malformed escape sequence is passed on verbatim instead.
  ///
  /// Recognised escape sequences are: escape followed by 'n' (yields a
  /// newline), and escape followed by a quote, separator or escape
  /// character (yields that character). Quote characters toggle quoted
  /// mode and are not copied to the token; separators inside quotes are
  /// copied literally.
  ///
  /// The iterators passed to operator() must support operator< and
  /// operator>= (ie be random access).
  template <class Char,
            class Traits = typename std::basic_string<Char>::traits_type >
  class EscapedListSeparator {

  private:
    using string_type = std::basic_string<Char,Traits>;

    string_type escape_; ///< set of escape characters
    string_type c_;      ///< set of separator characters
    string_type quote_;  ///< set of quote characters
    bool last_;          ///< true if the previous token was terminated by a separator

    /// @return true if \a e compares equal (per Traits::eq) to any character of \a set
    static bool contains(const string_type& set, Char e) {
      return std::find_if(set.begin(),set.end(),
                          [e](Char c) {return Traits::eq(e,c);})!=set.end();
    }
    bool is_escape(Char e) const {return contains(escape_,e);}
    bool is_c(Char e) const {return contains(c_,e);}
    bool is_quote(Char e) const {return contains(quote_,e);}

    /// Process the escape sequence starting at \a next (which refers to
    /// the escape character), appending the result to \a tok.
    /// On return \a next refers to the last character consumed, or
    /// equals \a end if the escape character was the final character.
    template <typename iterator, typename Token>
    void do_escape(iterator& next,iterator end,Token& tok) {
      if (++next >= end) {
        // dangling escape at end of input: don't throw, but pass on
        // verbatim. Must return here, as *next is not dereferenceable.
        tok+=escape_.front();
        return;
      }
      if (Traits::eq(*next,'n')) {
        tok+='\n';
        return;
      }
      if (is_quote(*next) || is_c(*next) || is_escape(*next)) {
        tok+=*next;
        return;
      }
      // unrecognised escape sequence: don't throw, but pass on verbatim.
      // Append the two characters separately - adding them together
      // would sum their character codes into a single bogus character.
      tok+=escape_.front();
      tok+=*next;
    }

  public:

    /// @param e escape character
    /// @param c separator character
    /// @param q quote character
    explicit EscapedListSeparator(Char e = '\\',
                                  Char c = ',',Char q = '\"')
      : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }

    /// As above, but each argument is a set of alternative characters
    EscapedListSeparator(string_type e, string_type c, string_type q)
      : escape_(e), c_(c), quote_(q), last_(false) { }

    void reset() {last_=false;}

    /// Extract the next token from [next,end) into \a tok, advancing \a next.
    /// @return true if a token was extracted. If the input ends with a
    /// separator, one further call returns true with an empty token
    /// (the trailing blank field) before false is returned.
    template <typename InputIterator, typename Token>
    bool operator()(InputIterator& next,InputIterator end,Token& tok) {
      bool bInQuote = false;
      tok = Token();

      if (next >= end) {
        next=end; // reset next in case it has advanced beyond
        if (last_) {
          last_ = false;
          return true;
        }
        return false;
      }
      last_ = false;
      while (next < end) {
        if (is_escape(*next)) {
          do_escape(next,end,tok);
          // a dangling escape consumed the remaining input: stop
          // here rather than incrementing next beyond end
          if (next >= end) break;
        }
        else if (is_c(*next)) {
          if (!bInQuote) {
            // If we are not in quote, then we are done
            ++next;
            // The last character was a c, that means there is
            // 1 more blank field
            last_ = true;
            return true;
          }
          tok+=*next;
        }
        else if (is_quote(*next)) {
          bInQuote=!bInQuote;
        }
        else {
          tok += *next;
        }
        ++next;
      }
      return true;
    }
  };
}
155 | | -using Parser=escapedListSeparator::EscapedListSeparator<char>; |
156 | | - |
157 | | -typedef boost::tokenizer<Parser> Tokenizer; |
158 | | - |
/// Tokenizer function object splitting a character sequence into
/// fields separated by runs of whitespace, honouring quote and escape
/// characters.
struct SpaceSeparatorParser
{
  char escape, quote;
  /// @param escape character causing the following character to be taken literally
  /// @param sep ignored - fields are always separated by whitespace
  /// @param quote character toggling quoted mode, in which whitespace is literal
  SpaceSeparatorParser(char escape='\\', char sep=' ', char quote='"'):
    escape(escape), quote(quote) {}

  /// Extract the next token from [next,end) into \a tok, advancing
  /// \a next past the token and any whitespace following it.
  /// @return true if a token was terminated by whitespace (the token
  /// may be empty if the input starts with whitespace), or if a
  /// nonempty token was terminated by the end of input.
  template <class I>
  bool operator()(I& next, I end, std::string& tok)
  {
    tok.clear();
    bool quoted=false;
    while (next!=end)
      {
        if (*next==escape)
          {
            ++next;
            if (next==end)
              {
                // dangling escape at end of input: nothing to
                // escape, so pass it on verbatim
                tok+=escape;
                break;
              }
            tok+=*next;
          }
        else if (*next==quote)
          quoted=!quoted;
        // cast required, as isspace is undefined for negative char values
        else if (!quoted && isspace(static_cast<unsigned char>(*next)))
          {
            // skip the whole run of whitespace, stopping at end of input
            while (next!=end && isspace(static_cast<unsigned char>(*next)))
              ++next;
            return true;
          }
        else
          tok+=*next;
        ++next;
      }
    return !tok.empty();
  }
  void reset() {}
};
188 | | - |
189 | 49 | namespace |
190 | 50 | { |
191 | 51 | /// An any with cached hash |
@@ -383,6 +243,7 @@ void DataSpec::setDataArea(size_t row, size_t col) |
383 | 243 | dataCols.erase(i); |
384 | 244 | for (unsigned i=m_nColAxes; i<numCols && i<maxColumn; ++i) |
385 | 245 | dataCols.insert(i); |
| 246 | + toSchema(); |
386 | 247 | } |
387 | 248 |
|
388 | 249 |
|
@@ -573,49 +434,6 @@ void DataSpec::populateFromRavelMetadata(const std::string& metadata, const stri |
573 | 434 |
|
574 | 435 | namespace minsky |
575 | 436 | { |
// Strip a single trailing carriage return from buf, if present, so
// that lines read from DOS files ('\r' '\n' line terminators) are
// handled the same as those with a bare '\n'
void chomp(string& buf)
{
  if (buf.empty()) return;
  if (buf.back()=='\r')
    buf.pop_back();
}
582 | | - |
583 | | - // gets a line, accounting for quoted newlines |
584 | | - bool getWholeLine(istream& input, string& line, const DataSpec& spec) |
585 | | - { |
586 | | - line.clear(); |
587 | | - bool r=getline(input,line).good(); |
588 | | - chomp(line); |
589 | | - while (r) |
590 | | - { |
591 | | - int quoteCount=0; |
592 | | - for (auto i: line) |
593 | | - if (i==spec.quote) |
594 | | - ++quoteCount; |
595 | | - if (quoteCount%2==0) break; // data line correctly terminated |
596 | | - string buf; |
597 | | - r=getline(input,buf).good(); // read next line and append |
598 | | - chomp(buf); |
599 | | - line+=buf; |
600 | | - } |
601 | | - escapeDoubledQuotes(line,spec); |
602 | | - return r || !line.empty(); |
603 | | - } |
604 | | - |
605 | | - void escapeDoubledQuotes(std::string& line,const DataSpec& spec) |
606 | | - { |
607 | | - // replace doubled quotes with escape quote |
608 | | - for (size_t i=1; i<line.size(); ++i) |
609 | | - if (line[i]==spec.quote && line[i-1]==spec.quote && |
610 | | - ((i==1 && (i==line.size()-1|| line[i+1]!=spec.quote)) || // deal with leading "" |
611 | | - (i>1 && |
612 | | - ((line[i-2]!=spec.quote && line[i-2]!=spec.escape && |
613 | | - (line[i-2]!=spec.separator || i==line.size()-1|| line[i+1]!=spec.quote)) // deal with ,'' |
614 | | - || // deal with "" middle or end |
615 | | - (line[i-2]==spec.quote && (i==2 || line[i-3]==spec.separator || line[i-3]==spec.escape)))))) // deal with leading """ |
616 | | - line[i-1]=spec.escape; |
617 | | - } |
618 | | - |
619 | 437 | /// handle reporting errors in loadValueFromCSVFileT when loading files |
620 | 438 | struct OnError |
621 | 439 | { |
|
0 commit comments