1 module jcli.argparser.tokenizer;
2 
3 import std.range;
4 
/// A single token produced from the command line arguments:
/// its classification plus the slices of the input it was cut from.
struct ArgToken 
{
    /// Classification of a token. Most members are combinations of the `*Bit` flags below.
    enum Kind
    {
        /// Uninitialized.
        none = 0,
        /// A bare `--`; 2 dashes have a special meaning of a delimiter.
        twoDashesDelimiter = 1,

        // (A `rawText` kind, for arguments that appear after any named arguments, is currently disabled.)

        /// Flag bit marking the token as an argument name.
        argumentNameBit = 1 << 4,
        /// A long name. Example: --stuff
        fullNamedArgumentName = argumentNameBit | 1,
        /// A short name. Example: -a
        shortNamedArgumentName = argumentNameBit | 2,

        /// Flag bit marking the token as containing a value.
        valueBit = 1 << 5,
        /// Flag bit: the value may correspond to a named argument.
        namedArgumentValueBit = 1 << 0,
        /// Flag bit: the value may correspond to a positional argument.
        positionalArgumentBit = 1 << 1,
        /// Flag bit: the value may correspond to an orphan argument.
        /// An argument is called orphan when it appears after a named argument name.
        /// Example: not_orphan -arg_name maybe_orphan definitely_orphan
        orphanArgumentBit = 1 << 2,
        /// The right part of -a=b, -a="b", --stuff=b or --stuff="b".
        namedArgumentValue = namedArgumentValueBit | valueBit,
        /// The `value` in `--stuff value`: either the value of `--stuff`, or an orphan.
        namedArgumentValueOrOrphanArgument = namedArgumentValueBit | orphanArgumentBit | valueBit,
        /// Arguments that appear before any named arguments.
        /// Example: value --stuff not_this_one
        positionalArgument = positionalArgumentBit | valueBit,
        /// A value that can only be an orphan (see `orphanArgumentBit`).
        orphanArgument = orphanArgumentBit | valueBit,
        
        /// Flag bit indicating that an error has occurred.
        errorBit = 1 << 6,
        /// 3 dashes are ambiguous and are not allowed.
        error_threeOrMoreDashes = errorBit | 1,
        /// Lonely dash not allowed. (Should it be parsed as positional instead??)
        error_singleDash = errorBit | 2,
        /// `--arg="` causes this error.
        error_malformedQuotes = errorBit | 3,
        /// `--arg=` causes this error.
        error_noValueForNamedArgument = errorBit | 4,
        /// `--arg="...` causes this error.
        error_unclosedQuotes = errorBit | 5,
        /// `--arg="..."...` causes this error.
        error_inputAfterClosedQuote = errorBit | 6,
        /** 
            `--arg= `

            Can happen when a user invokes a command like this:
            --name "--arg= "

            Which the program sees like this:
            ["--name", "--arg= "]

            So it assumes "--name" is a flag, and "--arg" is the name of the next argument,
            while in fact the "--arg=" part is an argument to the previous command.
            In this situation we emit this error, which you should fix with `--name="--args"`.

            However, the situation `--name "--arg"` cannot be physically accounted for,
            so in that case we emit this error in the binder, which has semantic info.
        */
        error_spaceAfterAssignment = errorBit | 7,
        /// ditto
        error_spaceAfterDashes = errorBit | 8,
    }

    /// Builds a token from its kind, the full input slice it covers,
    /// and its value slice (which doubles as the name slice for name tokens).
    this(Kind kind, string fullSlice, string valueSlice) @safe pure @nogc nothrow
    {
        this.fullSlice = fullSlice;
        this.valueSlice = valueSlice;
        this.kind = kind;
    }

    /// What this token is.
    Kind kind;
    /// The whole piece of input this token was produced from.
    string fullSlice;

    /// The payload of the token. Both members share the same storage:
    /// `nameSlice` is simply another name for `valueSlice`.
    union
    {
        string valueSlice;
        string nameSlice;
    }
}
94 
// Mixes the flag helpers from `jcli.core.utils` in for `ArgToken.Kind`.
// For now they are only visible within the `jcli` package, but we may want
// them to be public, since they are useful.
// NOTE(review): presumably this is what provides the `has` / `hasEither` calls
// used on `ArgToken.Kind` in this module — confirm against `FlagsHelpers`.
package (jcli)
{
    import jcli.core.utils : FlagsHelpers;
    mixin FlagsHelpers!(ArgToken.Kind);
}
101 
/**
    Lazily turns a range of command line argument strings into `ArgToken`s.

    It exposes the input range primitives (`front`, `empty`, `popFront`).
    A single input string may yield more than one token: `--name=value` yields
    the name token first, and the value token on the following `popFront`.

    A freshly constructed tokenizer holds no token yet (`front.kind` is `Kind.none`);
    `popFront` must be called once to load the first token.
*/
struct ArgTokenizer(TRange)
    if (isInputRange!TRange 
        && is(ElementType!TRange == string))
{
    private
    {
        /// The input strings that have not been fully consumed yet.
        TRange _range;
        /// Set once `popFront` is called with no input left.
        bool _empty = false;
        /// The current token. Its kind also determines how the next string is classified.
        ArgToken _front = ArgToken.init;
        /// Non-zero only while we are in the middle of `_range.front`
        /// (after the name part of `--name=value` has been returned).
        size_t _positionWithinCurrentString = 0;
    }

    @safe pure @nogc:

    /// Resets the internal state, such that the orphan arguments become considered positional again.
    /// If the current token is a `namedArgumentValue`, it is skipped first.
    /// Does nothing if the tokenizer is already exhausted.
    void resetWithRemainingRange()
    {
        // There is nothing left to reinterpret, and `popFront` must not be called
        // on an exhausted tokenizer (it asserts).
        if (_empty)
            return;

        alias Kind = ArgToken.Kind;
        if (_front.kind == Kind.namedArgumentValue)
            popFront();
        if (_front.kind.has(Kind.orphanArgument))
            _front.kind = Kind.positionalArgument;
    }
    
    /// The current token. Must not be called when `empty`.
    ArgToken front() const nothrow pure @safe
    {
        assert(!_empty);
        return _front;
    }

    /// Whether all tokens have been consumed.
    bool empty() const nothrow pure @safe 
    {
        return _empty;
    }

    /// The input strings that have not been fully tokenized yet.
    /// NOTE: this property does not take into account the position within the string.
    inout(TRange) leftoverRange() inout nothrow pure @safe
    {
        return _range;
    }

    /// Advances to the next token, or marks the tokenizer as empty when no input is left.
    ///
    /// This function may throw if the characters of argument values are not valid utf8 characters.
    /// This function fails in debug if the passed arguments are not properly shell escaped.
    /// This function assumes that all option names are valid ascii symbols.
    void popFront()
    {
        if (_range.empty)
        {
            assert(!_empty);
            _empty = true;
            return;
        }
        _front = _popFrontInternal();
    }

    /// Produces the next token from `_range.front`, starting at `_positionWithinCurrentString`.
    /// Advances `_range` whenever the current string has been fully consumed.
    /// See `popFront` for the assumptions made about the input.
    private ArgToken _popFrontInternal()
    {
        assert(!empty);

        const currentSlice = _range.front;
        const initialPosition = _positionWithinCurrentString;

        // The part of the current string consumed by this call so far.
        string getCurrentFullSlice()
        {
            return currentSlice[initialPosition .. _positionWithinCurrentString];
        }
        // Must only be called while the position is within the current string.
        char getCurrentCharacter()
        {
            return currentSlice[_positionWithinCurrentString];
        }
        // Bounds-checked test for a dash at the current position.
        // An empty argument (e.g. `""` passed by the shell) has no characters at all,
        // so indexing it unconditionally would be a range violation.
        bool isAtDash()
        {
            return _positionWithinCurrentString < currentSlice.length
                && getCurrentCharacter() == '-';
        }

        // Moves on to the next input string.
        // Empty or whitespace-only strings get no special treatment here.
        void popFrontAndReset()
        {
            _range.popFront();
            _positionWithinCurrentString = 0;
        }

        alias Kind = ArgToken.Kind;
        Kind previousKind = _front.kind;

        ArgToken parseArgumentName()
        {
            // This function assumes the current character is a dash
            // Note to devs: if you want the logic after that, extract another local function.
            assert(getCurrentCharacter() == '-');
            _positionWithinCurrentString++;

            Kind potentialNamedArgumentKind;
            // A lonely dash without a name is not allowed.
            if (currentSlice.length == _positionWithinCurrentString)
            {
                const fullSlice  = getCurrentFullSlice();
                const valueSlice = "";
                popFrontAndReset();
                return ArgToken(Kind.error_singleDash, fullSlice, valueSlice);
            }
            // Double dash.
            else if (getCurrentCharacter() == '-')
            {
                potentialNamedArgumentKind = Kind.fullNamedArgumentName;
                _positionWithinCurrentString++;
            }
            // Shorthand argument.
            else
            {
                potentialNamedArgumentKind = Kind.shortNamedArgumentName;
            }

            // Two dashes without name following them mean the delimiter.
            if (_positionWithinCurrentString == currentSlice.length)
            {
                const fullSlice  = getCurrentFullSlice();
                const valueSlice = fullSlice;
                popFrontAndReset();
                return ArgToken(Kind.twoDashesDelimiter, fullSlice, valueSlice);
            }

            // If there is a space, at that point it must have been split already.
            // See `Kind.error_spaceAfterDashes`.
            if (getCurrentCharacter() == ' ')
            {
                // The arguments must be shell escaped prior to sending them to the parser.
                const kind       = Kind.error_spaceAfterDashes;
                // Note: here the slices are what follows the dashes, not the dashes themselves.
                const fullSlice  = currentSlice[_positionWithinCurrentString .. $];
                const valueSlice = fullSlice;
                popFrontAndReset();
                return ArgToken(kind, fullSlice, valueSlice);
            }

            // A third dash. (After a single dash this cannot be true,
            // because a second dash would have been consumed above.)
            if (getCurrentCharacter() == '-')
            {
                _positionWithinCurrentString++;
                const kind       = Kind.error_threeOrMoreDashes;
                const fullSlice  = getCurrentFullSlice();
                const valueSlice = fullSlice;
                popFrontAndReset();
                return ArgToken(kind, fullSlice, valueSlice);
            }

            // Even though in the struct definition it is called "value slice",
            // I figured "name slice" in this context makes more sense, because
            // we're parsing an option name.
            const nameStartPosition = _positionWithinCurrentString;
            string getCurrentNameSlice()
            {
                return currentSlice[nameStartPosition .. _positionWithinCurrentString];
            }

            // The name extends until '=', whitespace, or the end of the string.
            while (_positionWithinCurrentString < currentSlice.length)
            {
                char ch = getCurrentCharacter();
                if (ch == '=')
                {
                    const fullSlice = getCurrentFullSlice();
                    const nameSlice = getCurrentNameSlice();
                    // Step over the '='. The non-zero position tells the next call
                    // that it must parse the right hand side of the assignment.
                    _positionWithinCurrentString++;
                    return ArgToken(potentialNamedArgumentKind, fullSlice, nameSlice);
                }
                import std.ascii : isWhite;
                if (!isWhite(ch))
                {
                    _positionWithinCurrentString++;
                    continue;
                }
                break;
            }

            {
                const fullSlice = getCurrentFullSlice();
                const nameSlice = getCurrentNameSlice();
                
                // Only move on to the next string if this one has been consumed entirely.
                // If we stopped at whitespace, the rest is handled by the next call.
                if (_positionWithinCurrentString == currentSlice.length)
                {
                    popFrontAndReset();
                }

                return ArgToken(potentialNamedArgumentKind, fullSlice, nameSlice);
            }
        }

        if (previousKind & Kind.argumentNameBit)
        {
            // If the position is not 0, that means we're taking off after an option
            // has been specified and we're on the other side of '='.
            const bool isRHSOfEqual = _positionWithinCurrentString > 0;

            // We must always parse the value that follows as a value literal, allowing any characters.
            // For simplicity, let's say we only allow quoting with "" and not with ^^ or any other nonsense.
            if (isRHSOfEqual)
            {
                // `--arg=`
                if (_positionWithinCurrentString == currentSlice.length)
                {
                    const kind       = Kind.error_noValueForNamedArgument;
                    // We have the possibility to put more info here, if needed.
                    const fullSlice  = "";
                    const valueSlice = "";
                    popFrontAndReset();
                    return ArgToken(kind, fullSlice, valueSlice);
                }

                if (getCurrentCharacter() == '"')
                {
                    _positionWithinCurrentString++;

                    // `--arg="`
                    if (_positionWithinCurrentString == currentSlice.length)
                    {
                        const kind       = Kind.error_malformedQuotes;
                        const fullSlice  = getCurrentFullSlice();
                        const valueSlice = fullSlice;
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }

                    const valueStartIndex = _positionWithinCurrentString;

                    // At this point we might as well use the phobos indexOf function, because this part
                    // might have non-ascii characters so comparing bytes is just wrong. 
                    import std.string : indexOf;
                    auto indexOfQuote = indexOf(currentSlice[valueStartIndex .. $], '"');

                    // `--arg="...`
                    if (indexOfQuote == -1)
                    {
                        const kind       = Kind.error_unclosedQuotes;
                        const fullSlice  = currentSlice[initialPosition .. $];
                        const valueSlice = currentSlice[valueStartIndex .. $];
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }

                    // Make the index relative to the whole string, rather than to the value start.
                    indexOfQuote += valueStartIndex;
                    
                    // `--arg="..."...`
                    if (currentSlice.length != indexOfQuote + 1)
                    {
                        const kind       = Kind.error_inputAfterClosedQuote;
                        const fullSlice  = currentSlice[initialPosition .. $];
                        const valueSlice = currentSlice[valueStartIndex .. $];
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }

                    // `--arg="..."`
                    {
                        const kind       = Kind.namedArgumentValue;
                        const fullSlice  = currentSlice[initialPosition .. $];
                        const valueSlice = currentSlice[valueStartIndex .. indexOfQuote];
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }
                }

                // `--arg=...`
                {
                    const fullSlice  = currentSlice[initialPosition .. $];
                    const valueSlice = currentSlice[initialPosition .. $];
                    
                    // If the spaces got into the string, it was malformed from the start,
                    // or we have a rare edge case (see Kind.error_spaceAfterAssignment).
                    // We might want to display some more info here.
                    import std.string : indexOf;
                    const kind = indexOf(valueSlice, ' ') == -1
                        ? Kind.namedArgumentValue
                        : Kind.error_spaceAfterAssignment;

                    popFrontAndReset();
                    return ArgToken(kind, fullSlice, valueSlice);
                }
            }

            if (isAtDash())
            {
                return parseArgumentName();
            }

            // Otherwise the entire string is just an argument value like the "value" below.
            // ["--name", "value"].
            // We don't care whether it was quoted or not in the source, we just return the whole thing.
            // An empty string also ends up here, as an empty value.
            {
                const kind       = Kind.namedArgumentValueOrOrphanArgument;
                const fullSlice  = currentSlice;
                const valueSlice = currentSlice;
                _range.popFront();
                return ArgToken(kind, fullSlice, valueSlice);
            }
        }

        
        // It is not a named arg (technically these checks are not needed, but let's do it just in case).
        assert(
            // Covers all special cases, like the first argument
            previousKind < Kind.valueBit
            || previousKind.hasEither(Kind.errorBit | Kind.valueBit));

        assert(_positionWithinCurrentString == 0, "??");

        if (isAtDash())
        {
            return parseArgumentName();
        }

        {
            // If it's not a named arg, then it's just a value like this
            // --arg value
            // or like this
            // --arg "ba ba ba"
            // We see it unquoted, so we just return the value
            const kind = 
            (){
                // No input yet
                if (previousKind == Kind.none)
                    return Kind.positionalArgument;

                // A value right after the `--` delimiter.
                // The delimiter has no value bit, so it must be handled
                // before the value bit sanity check below.
                if (previousKind == Kind.twoDashesDelimiter)
                    return Kind.orphanArgument;

                // Say, if the input is malformatted, we consider everything after that orphans,
                // I guess this is pretty reasonable.
                if (previousKind.has(Kind.errorBit))
                    return Kind.orphanArgument;

                // Defensive: names have already been handled by the branch above.
                if (previousKind.has(Kind.argumentNameBit))
                    return Kind.namedArgumentValueOrOrphanArgument;

                // Just to be sure nothing went wrong.
                assert(previousKind.has(Kind.valueBit));

                if (previousKind.hasEither(Kind.orphanArgumentBit | Kind.positionalArgumentBit))
                {
                    // Copy the positional or the orphan bit of the previous argument.
                    return previousKind & ~Kind.namedArgumentValueBit;
                }

                assert(previousKind == Kind.namedArgumentValue);
                return Kind.orphanArgument;
            }();
            const fullSlice  = currentSlice;
            const valueSlice = currentSlice;
            _range.popFront();
            return ArgToken(kind, fullSlice, valueSlice);
        }
    }
}
471 
/// Creates a tokenizer over `range` and loads its first token,
/// so that the result can be iterated right away.
ArgTokenizer!TRange argTokenizer(TRange)(TRange range)
{
    alias Tokenizer = ArgTokenizer!TRange;

    Tokenizer tokenizer = Tokenizer(range);
    // A newly constructed tokenizer has no current token yet.
    tokenizer.popFront();
    return tokenizer;
}
478 
unittest
{
    import std.algorithm : equal;
    alias Kind = ArgToken.Kind;

    // Tokenizes `input` and checks that exactly the `expected` tokens come out, in order.
    static void expectTokens(string[] input, ArgToken[] expected)
    {
        assert(equal(argTokenizer(input), expected));
    }

    // Plain values before any named argument are positional.
    expectTokens(["hello", "world"], [
        ArgToken(Kind.positionalArgument, "hello", "hello"),
        ArgToken(Kind.positionalArgument, "world", "world"),
    ]);

    // A value right after a name is either its value or an orphan.
    expectTokens(["--hello", "world"], [
        ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
        ArgToken(Kind.namedArgumentValueOrOrphanArgument, "world", "world"),
    ]);
    expectTokens(["-hello", "world"], [
        ArgToken(Kind.shortNamedArgumentName, "-hello", "hello"),
        ArgToken(Kind.namedArgumentValueOrOrphanArgument, "world", "world"),
    ]);

    // The second value after a name can only be an orphan.
    expectTokens(["--hello", "world", "world2"], [
        ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
        ArgToken(Kind.namedArgumentValueOrOrphanArgument, "world", "world"),
        ArgToken(Kind.orphanArgument, "world2", "world2"),
    ]);

    // Assignments, unquoted and quoted.
    expectTokens(["-hello=world"], [
        ArgToken(Kind.shortNamedArgumentName, "-hello", "hello"),
        ArgToken(Kind.namedArgumentValue, "world", "world"),
    ]);
    expectTokens([`--hello="world"`], [
        ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
        ArgToken(Kind.namedArgumentValue, `"world"`, "world"),
    ]);
    expectTokens([`--hello="--world"`], [
        ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
        ArgToken(Kind.namedArgumentValue, `"--world"`, "--world"),
    ]);

    // Dashes only.
    expectTokens(["--"], [
        ArgToken(Kind.twoDashesDelimiter, "--", "--"),
    ]);
    expectTokens(["-"], [
        ArgToken(Kind.error_singleDash, "-", ""),
    ]);
    expectTokens(["---"], [
        ArgToken(Kind.error_threeOrMoreDashes, "---", "---"),
    ]);

    // Whitespace-only argument is returned as is.
    expectTokens([" "], [
        ArgToken(Kind.positionalArgument, " ", " "),
    ]);

    // Malformed assignments.
    expectTokens(["--arg="], [
        ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
        ArgToken(Kind.error_noValueForNamedArgument, "", ""),
    ]);
    expectTokens([`--arg="`], [
        ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
        ArgToken(Kind.error_malformedQuotes, `"`, `"`),
    ]);
    expectTokens([`--arg="" stuff`], [
        ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
        ArgToken(Kind.error_inputAfterClosedQuote, `"" stuff`, `" stuff`),
    ]);
    expectTokens([`--arg="stuff`], [
        ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
        ArgToken(Kind.error_unclosedQuotes, `"stuff`, "stuff"),
    ]);
    expectTokens([`--arg= `], [
        ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
        ArgToken(Kind.error_spaceAfterAssignment, " ", " "),
    ]);

    // --arg "--arg=stuff"
    // is expected to parse as
    // --arg --arg=stuff
    expectTokens([`--arg`, `--arg=stuff`], [
        ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
        ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
        ArgToken(Kind.namedArgumentValue, "stuff", "stuff"),
    ]);

    // A mix of everything.
    expectTokens(["a", "--b", "c", "-d=e"], [
        ArgToken(Kind.positionalArgument, "a", "a"),

        ArgToken(Kind.fullNamedArgumentName, "--b", "b"),
        ArgToken(Kind.namedArgumentValueOrOrphanArgument, "c", "c"),

        ArgToken(Kind.shortNamedArgumentName, "-d", "d"),
        ArgToken(Kind.namedArgumentValue, "e", "e"),
    ]);

    // Non-ascii values.
    expectTokens(["--a", "Штука"], [
        ArgToken(Kind.fullNamedArgumentName, "--a", "a"),
        ArgToken(Kind.namedArgumentValueOrOrphanArgument, "Штука", "Штука"),
    ]);
    expectTokens([`--a="Штука"`], [
        ArgToken(Kind.fullNamedArgumentName, "--a", "a"),
        ArgToken(Kind.namedArgumentValue, `"Штука"`, "Штука"),
    ]);
    expectTokens(["--a", "物事"], [
        ArgToken(Kind.fullNamedArgumentName, "--a", "a"),
        ArgToken(Kind.namedArgumentValueOrOrphanArgument, "物事", "物事"),
    ]);

    // A tricky bug. The orphan argument after a sure positional would only have `valueBit`.
    expectTokens(["test", "-hello=world", "abc"], [
        ArgToken(Kind.positionalArgument, "test", "test"),
        ArgToken(Kind.shortNamedArgumentName, "-hello", "hello"),
        ArgToken(Kind.namedArgumentValue, "world", "world"),
        ArgToken(Kind.orphanArgument, "abc", "abc"),
    ]);
    
    // // Copy and paste around for debugging.

    // import std.stdio : writeln;
    // import std.array : array;
    
    // auto p = argTokenizer(args);

    // writeln(p.front);
    // writeln(p.front.valueSlice);
    // // writeln(p._range.front[p._positionWithinCurrentString]);
    // writeln(p._positionWithinCurrentString);
    
    // p.popFront();
    
    // auto a = p.front();
    // writeln(a);
    // writeln(a.fullSlice);
    // writeln(a.nameSlice);
    // writeln(a.kind);
    // writeln(p._positionWithinCurrentString);

    // p.popFront();
    // writeln(p.empty);
}
673