1 /// Contains a pull parser for command line arguments.
2 module jaster.cli.parser;
3 
4 private
5 {
6     import std.typecons : Flag;
7 }
8 
/// Identifies the kind of data carried by an `ArgToken`.
enum ArgTokenType
{
    /// No type at all. It is an error if the `ArgPullParser` ever hands this back.
    None,

    /// Plain text. Such values usually carry a meaning (e.g. the value of a named argument), but the parser
    /// cannot determine that meaning reliably, so interpreting them is left to the code using the parser.
    Text,

    /// The name of a short hand argument ('-h', '-c', etc.) $(B without) the leading '-'.
    ShortHandArgument,

    /// The name of a long hand argument ('--help', '--config', etc.) $(B without) the leading '--'.
    LongHandArgument,

    /// End of file/input.
    EOF,
}
28 
/// A single token: its text together with the kind of data that text represents.
struct ArgToken
{
    /// The text that makes up this token.
    string value = null;

    /// Which kind of data `value` holds.
    ArgTokenType type = ArgTokenType.None;
}
38 
/++
 + A pull parser for command line arguments.
 +
 + Notes:
 +  The input is given as a `string[]`. This mostly only matters for `ArgTokenType.Text` values.
 +  This is because the parser does not split up plain text by spaces like a shell would.
 +
 +  e.g. There will be different results between `ArgPullParser(["env set OAUTH_SECRET 29ef"])` and
 +  `ArgPullParser(["env", "set", "OAUTH_SECRET", "29ef"])`
 +
 +  The former is given back as a single token containing the entire string. The latter will return 4 tokens, containing the individual strings.
 +
 +  This behaviour is used because this parser is designed to take its input directly from the main function's args, which have already been
 +  processed by a shell.
 +
 + Argument Formats:
 +  The following named argument formats are supported.
 +
 +  '-n'         - Shorthand with no argument. (returns `ArgTokenType.ShortHandArgument`)
 +  '-n ARG'     - Shorthand with argument. (`ArgTokenType.ShortHandArgument` and `ArgTokenType.Text`)
 +  '-n=ARG'     - Shorthand with argument with an equals sign. The equals sign is removed from the token output. (`ArgTokenType.ShortHandArgument` and `ArgTokenType.Text`)
 +  '-nARG'      - Shorthand with argument with no space between them. (`ArgTokenType.ShortHandArgument` and `ArgTokenType.Text`)
 +
 +  '--name'     - Longhand with no argument. (returns `ArgTokenType.LongHandArgument`)
 +  '--name ARG' - Longhand with argument. (`ArgTokenType.LongHandArgument` and `ArgTokenType.Text`)
 +  '--name=ARG' - Longhand with argument with an equals sign. The equals sign is removed from the token output. (`ArgTokenType.LongHandArgument` and `ArgTokenType.Text`)
 +
 +  A '-' or '--' that is directly followed by whitespace, or by the end of its string, produces a token of the
 +  matching type with an empty name.
 + ++/
@safe
struct ArgPullParser
{
    /// Variables ///
    private
    {
        alias OrEqualSign = Flag!"equalSign";
        alias OrSpace = Flag!"space";

        string[] _args;
        size_t   _currentArgIndex;  // Current index into _args.
        size_t   _currentCharIndex; // Current index into the current arg.

        // Starts out as EOF so that `ArgPullParser.init.empty` is true.
        ArgToken _currentToken = ArgToken(null, ArgTokenType.EOF);
    }
    
    /++
     + Creates the parser and immediately parses the first token, so `front` is usable straight away.
     +
     + Params:
     +  args = The arguments to parse. Please see the 'notes' section for `ArgPullParser`.
     + ++/
    this(string[] args)
    {
        this._args = args;
        this.popFront();
    }

    /// Range interface ///
    public
    {
        /// Parses the next token.
        void popFront()
        {
            this.nextToken();
        }

        /// Returns: the last parsed token.
        ArgToken front()
        {
            return this._currentToken;
        }

        /// Returns: Whether there's no more characters to parse.
        bool empty()
        {
            return this._currentToken.type == ArgTokenType.EOF;
        }
        
        /// Returns: A copy of the pull parser in its current state.
        ArgPullParser save()
        {
            // A plain struct copy duplicates both indices and the current token; the copy refers to
            // the same `_args` array as the original.
            return this;
        }

        /// Returns: The args that come after the arg currently being parsed, or `null` if there are none.
        @property
        string[] unparsedArgs()
        {
            return (this._currentArgIndex + 1 < this._args.length)
                   ? this._args[this._currentArgIndex + 1..$]
                   : null;
        }
    }

    /// Parsing ///
    private
    {
        // The arg that `_currentArgIndex` points at. The index must be in bounds.
        @property
        string currentArg()
        {
            return this._args[this._currentArgIndex];
        }

        // The part of the current arg that starts at `_currentCharIndex`.
        @property
        string currentArgSlice()
        {
            return this.currentArg[this._currentCharIndex..$];
        }

        // Advances past empty args and whitespace characters (moving on to following args as needed).
        // Afterwards, either `_currentArgIndex` is past the end of `_args`, or the current position is
        // on a non-whitespace character.
        void skipWhitespace()
        {
            import std.ascii : isWhite;

            if(this._currentArgIndex >= this._args.length)
                return;

            // Current arg could be empty, so get next arg.
            // *Next* arg could also be empty, so repeat until we either run out of args, or we find a non-empty one.
            while(this.currentArgSlice.length == 0)
            {
                this.nextArg();

                if(this._currentArgIndex >= this._args.length)
                    return;
            }

            auto arg = this.currentArg;
            while(arg[this._currentCharIndex].isWhite)
            {
                this._currentCharIndex++;
                if(this._currentCharIndex >= arg.length)
                {
                    // Next arg might start with whitespace, so we have to keep going.
                    // We recursively call this function so we don't have to copy the empty-check logic at the start of this function.
                    this.nextArg();
                    return this.skipWhitespace();
                }
            }
        }

        // Skips leading whitespace, then reads from the current position up to the end of the current arg,
        // or up to the first stop character, whichever comes first.
        //
        // Params:
        //  orSpace     = If yes, a whitespace character also ends the read.
        //  orEqualSign = If yes, an '=' also ends the read.
        //
        // Returns: The text that was read (the stop character is consumed but not included), or `null`
        //          if there was no input left.
        string readToEnd(OrSpace orSpace = OrSpace.no, OrEqualSign orEqualSign = OrEqualSign.no)
        {
            import std.ascii : isWhite;

            this.skipWhitespace();
            if(this._currentArgIndex >= this._args.length)
                return null;

            // Small optimisation: If we're at the very start, and we only need to read until the end, then just
            // return the entire arg.
            if(this._currentCharIndex == 0 && !orSpace && !orEqualSign)
            {
                auto arg = this.currentArg;

                // Force skipWhitespace to call nextArg on its next call.
                // We can't call nextArg here, as it breaks assumptions that unparsedArgs relies on.
                this._currentCharIndex = this.currentArg.length;
                return arg;
            }
            
            auto slice = this.currentArgSlice;
            size_t end = 0;
            while(end < slice.length)
            {
                if((slice[end].isWhite && orSpace)
                || (slice[end] == '=' && orEqualSign)
                )
                {
                    break;
                }

                end++;
                this._currentCharIndex++;
            }

            // Skip over whatever char we ended up on.
            // This is mostly to skip over the '=' sign if we're using that, but also saves 'skipWhitespace' a bit of hassle.
            if(end < slice.length)
                this._currentCharIndex++;

            return slice[0..end];
        }

        // Moves to the start of the next arg. Does no bounds checking; callers must do that.
        void nextArg()
        {
            this._currentArgIndex++;
            this._currentCharIndex = 0;
        }

        // Parses the next token and stores it in `_currentToken`.
        // Stores an EOF token once all of the input has been consumed.
        void nextToken()
        {
            import std.ascii : isWhite;

            this.skipWhitespace();
            if(this._currentArgIndex >= this._args.length)
            {
                this._currentToken = ArgToken("", ArgTokenType.EOF);
                return;
            }

            auto slice = this.currentArgSlice;
            if(slice.length >= 2 && slice[0..2] == "--")
            {
                this._currentCharIndex += 2;

                // Edge case: Since readToEnd can advance the "currentArgSlice", we get into this common situation
                //            of ["--", "b"] where this should be an unnamed long hand arg followed by the text "b", but
                //            instead it gets treated as "--b", which we don't want. So we're just checking for this here.
                //
                // `isWhite` is used (instead of comparing against ' ') so this agrees with what readToEnd and
                // skipWhitespace treat as whitespace; otherwise "--\tb" would be read as the long hand arg "b".
                if(this._currentCharIndex >= this.currentArg.length || isWhite(this.currentArg[this._currentCharIndex]))
                    this._currentToken = ArgToken("", ArgTokenType.LongHandArgument);
                else
                    this._currentToken = ArgToken(this.readToEnd(OrSpace.yes, OrEqualSign.yes), ArgTokenType.LongHandArgument);
                return;
            }
            else if(slice.length >= 1 && slice[0] == '-')
            {
                // NOTE(review): the name is taken as a single code unit (`slice[1..2]`), so a short hand name
                //               made of a multi-byte UTF-8 character would be split. Left as-is.
                this._currentCharIndex += (slice.length == 1) ? 1 : 2; // += 2 so we skip over the arg name.
                this._currentToken = ArgToken((slice.length == 1) ? "" : slice[1..2], ArgTokenType.ShortHandArgument);

                // Skip over the equals sign if there is one.
                if(this._currentCharIndex < this.currentArg.length
                && this.currentArg[this._currentCharIndex] == '=')
                    this._currentCharIndex++;

                // If it's unnamed, then sometimes the "name" can be a whitespace character, so we'll just handle that here.
                // Any whitespace character counts, to stay consistent with skipWhitespace.
                if(this._currentToken.value.length == 1 && isWhite(this._currentToken.value[0]))
                    this._currentToken.value = null;

                return;
            }
            else if(slice.length != 0)
            {
                this._currentToken = ArgToken(this.readToEnd(), ArgTokenType.Text);
                return;
            }
            
            assert(false, "EOF should've been returned. SkipWhitespace might not be working.");
        }
    }
}
///
@safe
unittest
{
    import std.array : array;

    auto tokens = ArgPullParser(
    [
        // Some plain text.
        "env", "set",

        // Long hand named arguments.
        "--config=MyConfig.json", "--config MyConfig.json",

        // Short hand named arguments.
        "-cMyConfig.json", "-c=MyConfig.json", "-c MyConfig.json",

        // Simple example to prove that you don't need the arg name and value in the same string.
        "-c", "MyConfig.json",

        // Plain text.
        "Some Positional Argument",

        // Raw Nameless named args
        "- a", "-", "a",
        "-- a", "--", "a"
    ]).array;

    // The tokens we expect to get back, in order.
    const ArgToken[] expected =
    [
        // Plain text.
        ArgToken("env",                      ArgTokenType.Text),
        ArgToken("set",                      ArgTokenType.Text),

        // Long hand named arguments.
        ArgToken("config",                   ArgTokenType.LongHandArgument),
        ArgToken("MyConfig.json",            ArgTokenType.Text),
        ArgToken("config",                   ArgTokenType.LongHandArgument),
        ArgToken("MyConfig.json",            ArgTokenType.Text),

        // Short hand named arguments.
        ArgToken("c",                        ArgTokenType.ShortHandArgument),
        ArgToken("MyConfig.json",            ArgTokenType.Text),
        ArgToken("c",                        ArgTokenType.ShortHandArgument),
        ArgToken("MyConfig.json",            ArgTokenType.Text),
        ArgToken("c",                        ArgTokenType.ShortHandArgument),
        ArgToken("MyConfig.json",            ArgTokenType.Text),
        ArgToken("c",                        ArgTokenType.ShortHandArgument),
        ArgToken("MyConfig.json",            ArgTokenType.Text),

        // Plain text.
        ArgToken("Some Positional Argument", ArgTokenType.Text),

        // Raw Nameless named args.
        ArgToken("",                         ArgTokenType.ShortHandArgument),
        ArgToken("a",                        ArgTokenType.Text),
        ArgToken("",                         ArgTokenType.ShortHandArgument),
        ArgToken("a",                        ArgTokenType.Text),
        ArgToken("",                         ArgTokenType.LongHandArgument),
        ArgToken("a",                        ArgTokenType.Text),
        ArgToken("",                         ArgTokenType.LongHandArgument),
        ArgToken("a",                        ArgTokenType.Text)
    ];

    // Compare each parsed token against its expected counterpart.
    foreach(i, want; expected)
        assert(tokens[i] == want);
}
345 
@("Issue: .init.empty must be true")
@safe
unittest
{
    // A default-initialised parser has been given no input, so it must already report itself as empty.
    ArgPullParser parser;
    assert(parser.empty);
}
352 
@("Test unparsedArgs")
@safe
unittest
{
    // Despite the last arg containing two tokens, they currently both get skipped over, even though only one was parsed so far ;/
    string[] input = ["one", "-t", "--three", "--unfortunate=edgeCase"];
    auto parser = ArgPullParser(input);

    // The constructor already parsed the first token, so only the args after "one" remain.
    size_t consumed = 1;
    assert(parser.unparsedArgs == input[consumed..$]);

    // Each further token moves the parser onto the next arg.
    while(consumed < 4)
    {
        parser.popFront();
        consumed++;
        assert(parser.unparsedArgs == input[consumed..$]);
    }

    assert(parser.unparsedArgs is null);
}