paludis  Version 2.6.0
tokeniser.hh
Go to the documentation of this file.
1 /* vim: set sw=4 sts=4 et foldmethod=syntax : */
2 
3 /*
4  * Copyright (c) 2006, 2007, 2010 Ciaran McCreesh
5  *
6  * This file is part of the Paludis package manager. Paludis is free software;
7  * you can redistribute it and/or modify it under the terms of the GNU General
8  * Public License version 2, as published by the Free Software Foundation.
9  *
10  * Paludis is distributed in the hope that it will be useful, but WITHOUT ANY
11  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13  * details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
17  * Place, Suite 330, Boston, MA 02111-1307 USA
18  */
19 
20 #ifndef PALUDIS_GUARD_PALUDIS_TOKENISER_HH
21 #define PALUDIS_GUARD_PALUDIS_TOKENISER_HH 1
22 
23 #include <iterator>
26 #include <string>
27 
28 /** \file
29  * Declarations for Tokeniser and related utilities.
30  *
31  * \ingroup g_strings
32  *
33  * \section Examples
34  *
35  * - None at this time.
36  */
37 
38 namespace paludis
39 {
/**
 * Delimiter policy for Tokeniser.
 *
 * The tags declared here are only ever used as template arguments; they
 * are never defined in this header.
 *
 * \ingroup g_strings
 */
namespace delim_kind
{
    /**
     * Any of the characters split, and the delimiter is discarded.
     *
     * \ingroup g_strings
     */
    struct AnyOfTag;
}
54 
/**
 * Delimiter mode for Tokeniser.
 *
 * The tags declared here select which tokeniser_internals::Writer
 * specialisation is used; they are never defined in this header.
 *
 * \ingroup g_strings
 */
namespace delim_mode
{
    /**
     * Discard the delimiters.
     *
     * \ingroup g_strings
     */
    struct DelimiterTag;

    /**
     * Keep the delimiters.
     *
     * \ingroup g_strings
     */
    struct BoundaryTag;
}
76 
77  /**
78  * Tokeniser internal use only.
79  *
80  * \ingroup g_strings
81  */
82  namespace tokeniser_internals
83  {
/**
 * A Writer handles Tokeniser's writes.
 *
 * Only declared here; behaviour is supplied by the specialisations on
 * the delimiter mode tag.
 *
 * \ingroup g_strings
 */
template <typename DelimMode_, typename Iter_>
struct Writer;
91 
92  /**
93  * A Writer handles Tokeniser's writes (specialisation for
94  * delim_mode::DelimiterTag).
95  *
96  * \ingroup g_strings
97  */
98  template <typename Iter_>
99  struct Writer<delim_mode::DelimiterTag, Iter_>
100  {
101  /**
102  * Handle a token.
103  */
104  static void handle_token(const std::string & s, Iter_ & i)
105  {
106  *i++ = s;
107  }
108 
109  /**
110  * Handle a delimiter.
111  */
112  static void handle_delim(const std::string &, const Iter_ &)
113  {
114  }
115  };
116 
117  /**
118  * A Writer handles Tokeniser's writes (specialisation for
119  * delim_mode::BoundaryTag).
120  *
121  * \ingroup g_strings
122  */
123  template <typename Iter_>
124  struct Writer<delim_mode::BoundaryTag, Iter_>
125  {
126  /**
127  * Handle a token.
128  */
129  static void handle_token(const std::string & s, Iter_ & i)
130  {
131  *i++ = s;
132  }
133 
134  /**
135  * Handle a delimiter.
136  */
137  static void handle_delim(const std::string & s, Iter_ & i)
138  {
139  *i++ = s;
140  }
141  };
142 
/**
 * Splits a string into runs of delimiter characters, single quote
 * characters, and runs of ordinary text.
 *
 * Each successful call to next() stores the piece just read in \c value
 * and its classification in \c kind.
 */
struct Lexer
{
    const std::string text;             ///< Copy of the input being lexed.
    std::string::size_type text_pos;    ///< Offset of the next unread character of \c text.
    std::string delims;                 ///< Current delimiter characters; not const, so it can be changed between calls to next().
    const std::string quotes;           ///< Characters treated as quotes.

    std::string value;                  ///< The piece produced by the most recent successful next().
    enum { t_quote, t_delim, t_text } kind; ///< Classification of \c value.

    /**
     * \param t The text to lex.
     * \param d The characters treated as delimiters.
     * \param q The characters treated as quotes.
     */
    Lexer(const std::string & t, const std::string & d, const std::string & q) :
        text(t),
        text_pos(0),
        delims(d),
        quotes(q),
        value(),
        kind(t_text) // give kind a defined value before the first next()
    {
    }

    /**
     * Read the next piece of \c text into \c value and \c kind.
     *
     * A character found in \c delims takes precedence over one found in
     * \c quotes.
     *
     * \return false, leaving \c value and \c kind untouched, once all of
     * \c text has been consumed; true otherwise.
     */
    bool next()
    {
        if (text_pos >= text.length())
            return false;

        if (std::string::npos != delims.find(text[text_pos]))
        {
            // A maximal run of delimiter characters becomes one t_delim.
            std::string::size_type start_pos(text_pos);
            while (++text_pos < text.length())
                if (std::string::npos == delims.find(text[text_pos]))
                    break;

            value = text.substr(start_pos, text_pos - start_pos);
            kind = t_delim;
        }
        else if (std::string::npos != quotes.find(text[text_pos]))
        {
            // Quote characters are always produced one at a time.
            value = std::string(1, text[text_pos]);
            kind = t_quote;
            ++text_pos;
        }
        else
        {
            // Text runs up to, but not including, the next delimiter or
            // quote character (or the end of the input).
            std::string::size_type start_pos(text_pos);
            while (++text_pos < text.length())
                if (std::string::npos != delims.find(text[text_pos]))
                    break;
                else if (std::string::npos != quotes.find(text[text_pos]))
                    break;

            value = text.substr(start_pos, text_pos - start_pos);
            kind = t_text;
        }

        return true;
    }
};
197 
198  template <typename DelimKind_, typename DelimMode_ = delim_mode::DelimiterTag>
199  class Tokeniser;
200 
201  template <typename DelimMode_>
202  class Tokeniser<delim_kind::AnyOfTag, DelimMode_>
203  {
204  private:
205  Tokeniser();
206 
207  public:
208  template <typename Iter_>
209  static void tokenise(const std::string & s,
210  const std::string & delims,
211  const std::string & quotes,
212  Iter_ iter);
213  };
214  }
215 
216  /**
217  * Thrown if a Tokeniser encounters a syntax error (for example, mismatched quotes).
218  *
219  * \ingroup g_strings
220  * \since 0.26
221  */
223  public Exception
224  {
225  public:
226  ///\name Basic operations
227  ///\{
228 
229  TokeniserError(const std::string & s, const std::string & msg) noexcept;
230 
231  ///\}
232  };
233 
234  template <typename DelimMode_>
235  template <typename Iter_>
236  void
238  const std::string & s,
239  const std::string & delims,
240  const std::string & quotes,
241  Iter_ iter)
242  {
243  typedef tokeniser_internals::Lexer Lexer;
244  Lexer l(s, delims, quotes);
245 
246  enum { s_initial, s_had_quote, s_had_text, s_had_quote_text, s_had_quote_text_quote } state = s_initial;
247 
248  while (l.next())
249  {
250  switch (state)
251  {
252  case s_initial:
253  switch (l.kind)
254  {
255  case Lexer::t_quote:
256  state = s_had_quote;
257  l.delims = "";
258  break;
259 
260  case Lexer::t_delim:
261  state = s_initial;
263  break;
264 
265  case Lexer::t_text:
266  state = s_had_text;
268  break;
269  }
270  break;
271 
272  case s_had_quote:
273  switch (l.kind)
274  {
275  case Lexer::t_quote:
276  state = s_had_quote_text_quote;
277  l.delims = delims;
279  break;
280 
281  case Lexer::t_delim:
282  throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote");
283  break;
284 
285  case Lexer::t_text:
286  state = s_had_quote_text;
288  break;
289  }
290  break;
291 
292  case s_had_quote_text:
293  switch (l.kind)
294  {
295  case Lexer::t_text:
296  throw InternalError(PALUDIS_HERE, "t_text in s_had_quote_text");
297  break;
298 
299  case Lexer::t_delim:
300  throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote_text");
301  break;
302 
303  case Lexer::t_quote:
304  state = s_had_quote_text_quote;
305  l.delims = delims;
306  break;
307  }
308  break;
309 
310  case s_had_quote_text_quote:
311  switch (l.kind)
312  {
313  case Lexer::t_text:
314  throw TokeniserError(s, "Close quote followed by text");
315  break;
316 
317  case Lexer::t_quote:
318  throw TokeniserError(s, "Close quote followed by quote");
319  break;
320 
321  case Lexer::t_delim:
322  state = s_initial;
324  break;
325  }
326  break;
327 
328  case s_had_text:
329  switch (l.kind)
330  {
331  case Lexer::t_text:
332  throw InternalError(PALUDIS_HERE, "t_text in s_had_text");
333  break;
334 
335  case Lexer::t_quote:
336  throw TokeniserError(s, "Text followed by quote");
337  break;
338 
339  case Lexer::t_delim:
340  state = s_initial;
342  break;
343  }
344  break;
345  }
346  }
347 
348  switch (state)
349  {
350  case s_initial:
351  case s_had_text:
352  case s_had_quote_text_quote:
353  return;
354 
355  case s_had_quote:
356  case s_had_quote_text:
357  throw TokeniserError(s, "Unterminated quoted string");
358  }
359  }
360 
361  /**
362  * Tokenise a string.
363  *
364  * \ingroup g_strings
365  * \since 0.26
366  */
367  template <typename DelimKind_, typename DelimMode_, typename Iter_>
368  void tokenise(const std::string & s, const std::string & delims, const std::string & quotes, Iter_ iter)
369  {
370  tokeniser_internals::Tokeniser<DelimKind_, DelimMode_>::template tokenise<Iter_>(s, delims, quotes, iter);
371  }
372 
373  /**
374  * Convenience function: tokenise on whitespace.
375  *
376  * \ingroup g_strings
377  * \since 0.26
378  */
379  template <typename Iter_>
380  void tokenise_whitespace(const std::string & s, Iter_ iter)
381  {
382  tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "", iter);
383  }
384 
385  /**
386  * Convenience function: tokenise on whitespace, handling quoted strings.
387  *
388  * \ingroup g_strings
389  * \since 0.26
390  */
391  template <typename Iter_>
392  void tokenise_whitespace_quoted(const std::string &s, Iter_ iter)
393  {
394  tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "'\"", iter);
395  }
396 }
397 
398 #endif
Definition: about_metadata-fwd.hh:23
#define PALUDIS_HERE
Definition: exception.hh:216
static void handle_delim(const std::string &, const Iter_ &)
Definition: tokeniser.hh:112
Definition: tokeniser.hh:90
Definition: exception.hh:131
void tokenise_whitespace_quoted(const std::string &s, Iter_ iter)
Definition: tokeniser.hh:392
void tokenise_whitespace(const std::string &s, Iter_ iter)
Definition: tokeniser.hh:380
static void handle_delim(const std::string &s, Iter_ &i)
Definition: tokeniser.hh:137
static void handle_token(const std::string &s, Iter_ &i)
Definition: tokeniser.hh:104
Definition: exception.hh:74
Definition: tokeniser.hh:222
Definition: tokeniser.hh:199
void tokenise(const std::string &s, const std::string &delims, const std::string &quotes, Iter_ iter)
Definition: tokeniser.hh:368
static void handle_token(const std::string &s, Iter_ &i)
Definition: tokeniser.hh:129
Definition: tokeniser.hh:143
#define PALUDIS_VISIBLE
Definition: attributes.hh:59