module jcli.argparser.tokenizer;

import std.range;

/// A single token produced by `ArgTokenizer`.
///
/// `fullSlice` is the piece of the input the token was cut from, while
/// `valueSlice` / `nameSlice` (two names for the same storage) hold the
/// option name or the value with dashes and quotes stripped.
struct ArgToken
{
    /// Token category. The values are bit flags: `argumentNameBit`, `valueBit`
    /// and `errorBit` select the broad category, the low bits refine it.
    enum Kind
    {
        /// Uninitialized.
        none = 0,
        /// 2 dashes have a special meaning of a delimiter.
        twoDashesDelimiter = 1,
        // Arguments that appear after any named arguments.
        // rawText,

        /// The bit! Used to check if it's the argument name.
        argumentNameBit = 16,
        /// Example: --stuff
        fullNamedArgumentName = argumentNameBit | 1,
        /// Example: -a
        shortNamedArgumentName = argumentNameBit | 2,

        /// The bit! Indicating whether it contains a value.
        valueBit = 32,
        /// The bit indicating the argument value may correspond to a named argument
        namedArgumentValueBit = 1,
        /// The bit indicating the argument value may correspond to a positional argument
        positionalArgumentBit = 2,
        /// The bit indicating the argument value may correspond to an orphan argument.
        /// We call an argument orphan when it appears after a named argument name.
        /// Example: not_orphan -arg_name maybe_orphan definitely_orphan
        orphanArgumentBit = 4,
        /// The right part of -a=b, -a="b", --stuff=b or --stuff="b".
        namedArgumentValue = valueBit | namedArgumentValueBit,
        /// --stuff value
        namedArgumentValueOrOrphanArgument = valueBit | namedArgumentValueBit | orphanArgumentBit,
        /// Arguments that appear before any named arguments.
        /// value --stuff not_this_one
        positionalArgument = valueBit | positionalArgumentBit,
        /// A value that can only be an orphan (see `orphanArgumentBit`).
        orphanArgument = valueBit | orphanArgumentBit,

        /// The bit! indicating that an error has occurred.
        errorBit = 64,
        /// 3 dashes are ambiguous and are not allowed.
        error_threeOrMoreDashes = errorBit | 1,
        /// Lonely dash not allowed. (Should it be parsed as positional instead??)
        error_singleDash = errorBit | 2,
        /// `--arg="` causes this error.
        error_malformedQuotes = errorBit | 3,
        /// `--arg=` causes this error.
        error_noValueForNamedArgument = errorBit | 4,
        /// `--arg="...` causes this error.
        error_unclosedQuotes = errorBit | 5,
        /// `--arg="..."...` causes this error.
        error_inputAfterClosedQuote = errorBit | 6,
        /**
            `--arg= `

            Can happen in a situation, when a user invokes a command like this:
            --name "--arg= "

            Which the program sees like this:
            ["--name", "--arg= "]

            So it assumes "--name" is a flag, and "--arg" is the name of the next argument,
            while in fact the "--arg=" part is an argument to the previous command.
            In this situation we emit this error, which you should fix with `--name="--args"`.

            However, the situation `--name "--arg"` cannot be physically accounted for,
            so in that case we emit this error in the binder, which has semantic info.
        */
        error_spaceAfterAssignment = errorBit | 7,
        /// ditto
        error_spaceAfterDashes = errorBit | 8,
    }

    /// Builds a token from its kind, the full input slice and the name/value slice.
    this(Kind kind, string fullSlice, string valueSlice) @safe pure @nogc nothrow
    {
        this.kind = kind;
        this.fullSlice = fullSlice;
        this.valueSlice = valueSlice;
    }

    Kind kind;
    string fullSlice;

    // Both names refer to the same storage: `nameSlice` reads better for
    // argument-name tokens, `valueSlice` for value tokens.
    union
    {
        string valueSlice;
        string nameSlice;
    }
}

// For now scoped to the `jcli` package, but we may want it to be public, it is useful.
// NOTE(review): `has` / `hasEither` used below presumably come from this mixin - confirm.
package (jcli)
{
    import jcli.core.utils : FlagsHelpers;
    mixin FlagsHelpers!(ArgToken.Kind);
}

/// Input range of `ArgToken`s, lazily produced from a range of argument strings.
struct ArgTokenizer(TRange)
    if (isInputRange!TRange
        && is(ElementType!TRange == string))
{
    private
    {
        TRange _range;
        bool _empty = false;
        ArgToken _front = ArgToken.init;
        // Non-zero only while we're in the middle of a `name=value` string.
        size_t _positionWithinCurrentString = 0;
    }

    @safe pure @nogc:

    /// Resets the internal state, such that the orphan arguments become considered positional again.
    void resetWithRemainingRange()
    {
        alias Kind = ArgToken.Kind;
        if (_front.kind == Kind.namedArgumentValue)
            popFront();
        if (_front.kind.has(Kind.orphanArgument))
            _front.kind = Kind.positionalArgument;
    }

    /// The current token. Must not be called on an empty tokenizer.
    ArgToken front() const nothrow pure @safe
    {
        assert(!_empty);
        return _front;
    }

    /// Whether all tokens have been consumed.
    bool empty() const nothrow pure @safe
    {
        return _empty;
    }

    /// NOTE: this property does not take into account the position within the string.
    inout(TRange) leftoverRange() inout nothrow pure @safe
    {
        return _range;
    }

    /// This function may throw if the characters of argument values are not valid utf8 characters.
    /// This function fails in debug if the passed arguments are not properly shell escaped.
    /// This function assumes that all option names are valid ascii symbols.
    void popFront()
    {
        if (_range.empty)
        {
            assert(!_empty);
            _empty = true;
            return;
        }
        _front = _popFrontInternal();
    }

    /// ditto
    private ArgToken _popFrontInternal()
    {
        assert(!empty);

        const currentSlice = _range.front;
        const initialPosition = _positionWithinCurrentString;
        string getCurrentFullSlice()
        {
            return currentSlice[initialPosition .. _positionWithinCurrentString];
        }
        // Callers must make sure the position is within `currentSlice`.
        char getCurrentCharacter()
        {
            return currentSlice[_positionWithinCurrentString];
        }

        // Moves on to the next argument string.
        void popFrontAndReset()
        {
            _range.popFront();
            _positionWithinCurrentString = 0;

            // string newCurrent = _range.front;
            // size_t currentIndex = 0;
            // while (currentIndex < newCurrent.length
            //     && isWhite(newCurrent[currentIndex]))
            // {
            //     currentIndex++;
            // }
            // // Either it's a zero length argument, or it's all whitespace.
            // // We will treat it as is.
            // // The variable we set indicates that the next popFront should
            // // return the entire thing and just skip until the next one.
            // if (currentIndex == newCurrent.length)
            // {
            //     _isNextArgumentEmptyOrWhitespace = true;
            // }
            // else
            // {
            //     _positionWithinCurrentString = currentIndex;
            // }
        }

        alias Kind = ArgToken.Kind;
        Kind previousKind = _front.kind;

        ArgToken parseArgumentName()
        {
            // This function assumes the current character is a dash
            // Note to devs: if you want the logic after that, extract another local function.
            assert(getCurrentCharacter() == '-');
            _positionWithinCurrentString++;

            Kind potentialNamedArgumentKind;
            // A lonely dash without a name is not allowed.
            if (currentSlice.length == _positionWithinCurrentString)
            {
                const fullSlice = getCurrentFullSlice();
                const valueSlice = "";
                popFrontAndReset();
                return ArgToken(Kind.error_singleDash, fullSlice, valueSlice);
            }
            // Double dash.
            else if (getCurrentCharacter() == '-')
            {
                potentialNamedArgumentKind = Kind.fullNamedArgumentName;
                _positionWithinCurrentString++;
            }
            // Shorthand argument.
            else
            {
                potentialNamedArgumentKind = Kind.shortNamedArgumentName;
            }

            // Two dashes without name following them mean the delimiter.
            if (_positionWithinCurrentString == currentSlice.length)
            {
                const fullSlice = getCurrentFullSlice();
                const valueSlice = fullSlice;
                popFrontAndReset();
                return ArgToken(Kind.twoDashesDelimiter, fullSlice, valueSlice);
            }

            // If there is a space, at that point it must have been split already.
            // See `Kind.error_spaceAfterDashes`.
            if (getCurrentCharacter() == ' ')
            {
                // "The arguments must be shell escaped prior to sending them to the parser.");
                const kind = Kind.error_spaceAfterDashes;
                const fullSlice = currentSlice[_positionWithinCurrentString .. $];
                const valueSlice = fullSlice;
                popFrontAndReset();
                return ArgToken(kind, fullSlice, valueSlice);
            }

            if (getCurrentCharacter() == '-')
            {
                _positionWithinCurrentString++;
                const kind = Kind.error_threeOrMoreDashes;
                const fullSlice = getCurrentFullSlice();
                const valueSlice = fullSlice;
                popFrontAndReset();
                return ArgToken(kind, fullSlice, valueSlice);
            }

            // Even though in the struct definition it is called "value slice",
            // I figured "name slice" in this context makes more sense, because
            // we're parsing an option name.
            const nameStartPosition = _positionWithinCurrentString;
            string getCurrentNameSlice()
            {
                return currentSlice[nameStartPosition .. _positionWithinCurrentString];
            }

            import std.ascii : isWhite;

            // The name runs until '=', whitespace or the end of the string.
            while (_positionWithinCurrentString < currentSlice.length)
            {
                char ch = getCurrentCharacter();
                if (ch == '=')
                {
                    const fullSlice = getCurrentFullSlice();
                    const nameSlice = getCurrentNameSlice();
                    // Step over '=', leaving the position non-zero so that the next
                    // call knows it is on the right hand side of the assignment.
                    _positionWithinCurrentString++;
                    return ArgToken(potentialNamedArgumentKind, fullSlice, nameSlice);
                }
                if (isWhite(ch))
                    break;
                _positionWithinCurrentString++;
            }

            {
                const fullSlice = getCurrentFullSlice();
                const nameSlice = getCurrentNameSlice();

                if (_positionWithinCurrentString == currentSlice.length)
                {
                    popFrontAndReset();
                }

                return ArgToken(potentialNamedArgumentKind, fullSlice, nameSlice);
            }
        }

        if (previousKind & Kind.argumentNameBit)
        {
            // If the position is not 0, that means we're taking off after an option
            // has been specified and we're on the other side of '='.
            const bool isRHSOfEqual = _positionWithinCurrentString > 0;

            // We must always parse the value that follows as a value literal, allowing any characters.
            // For simplicity, let's say we only allow quoting with "" and not with ^^ or any other nonsense.
            if (isRHSOfEqual)
            {
                // At this point we might as well use the phobos indexOf function, because this part
                // might have non-ascii characters so comparing bytes is just wrong.
                import std.string : indexOf;

                // `--arg=`
                if (_positionWithinCurrentString == currentSlice.length)
                {
                    const kind = Kind.error_noValueForNamedArgument;
                    // We have the possibility to put more info here, if needed.
                    const fullSlice = "";
                    const valueSlice = "";
                    popFrontAndReset();
                    return ArgToken(kind, fullSlice, valueSlice);
                }

                if (getCurrentCharacter() == '"')
                {
                    _positionWithinCurrentString++;

                    // `--arg="`
                    if (_positionWithinCurrentString == currentSlice.length)
                    {
                        const kind = Kind.error_malformedQuotes;
                        const fullSlice = getCurrentFullSlice();
                        const valueSlice = fullSlice;
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }

                    const valueStartIndex = _positionWithinCurrentString;
                    auto indexOfQuote = indexOf(currentSlice[valueStartIndex .. $], '"');

                    // `--arg="...`
                    if (indexOfQuote == -1)
                    {
                        const kind = Kind.error_unclosedQuotes;
                        const fullSlice = currentSlice[initialPosition .. $];
                        const valueSlice = currentSlice[valueStartIndex .. $];
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }

                    // Make the index relative to the whole string.
                    indexOfQuote += valueStartIndex;

                    // `--arg="..."...`
                    if (currentSlice.length != indexOfQuote + 1)
                    {
                        const kind = Kind.error_inputAfterClosedQuote;
                        const fullSlice = currentSlice[initialPosition .. $];
                        const valueSlice = currentSlice[valueStartIndex .. $];
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }

                    // `--arg="..."`
                    {
                        const kind = Kind.namedArgumentValue;
                        const fullSlice = currentSlice[initialPosition .. $];
                        const valueSlice = currentSlice[valueStartIndex .. indexOfQuote];
                        popFrontAndReset();
                        return ArgToken(kind, fullSlice, valueSlice);
                    }
                }

                // `--arg=...`
                {
                    const fullSlice = currentSlice[initialPosition .. $];
                    const valueSlice = currentSlice[initialPosition .. $];

                    // If the spaces got into the string, it was malformed from the start,
                    // or we have a rare edge case (see Kind.error_spaceAfterAssignment).
                    // We might want to display some more info here.
                    const kind = indexOf(valueSlice, ' ') == -1
                        ? Kind.namedArgumentValue
                        : Kind.error_spaceAfterAssignment;

                    popFrontAndReset();
                    return ArgToken(kind, fullSlice, valueSlice);
                }
            }

            // A zero length argument (e.g. `--name ""`) has no first character to look at;
            // it is not an argument name, so it falls through and is returned as a value.
            if (currentSlice.length > 0 && getCurrentCharacter() == '-')
            {
                return parseArgumentName();
            }

            // Otherwise the entire string is just an argument value like the "value" below.
            // ["--name", "value"].
            // We don't care whether it was quoted or not in the source, we just return the whole thing.
            {
                const kind = Kind.namedArgumentValueOrOrphanArgument;
                const fullSlice = currentSlice;
                const valueSlice = currentSlice;
                _range.popFront();
                return ArgToken(kind, fullSlice, valueSlice);
            }
        }


        // It is not a named arg (technically these checks are not needed, but let's do it just in case).
        assert(
            // Covers all special cases, like the first argument
            previousKind < Kind.valueBit
            || previousKind.hasEither(Kind.errorBit | Kind.valueBit));

        assert(_positionWithinCurrentString == 0, "??");

        // Same as above: a zero length argument is a value, not a name.
        if (currentSlice.length > 0 && getCurrentCharacter() == '-')
        {
            return parseArgumentName();
        }

        {
            // If it's not a named arg, then it's just a value like this
            // --arg value
            // or like this
            // --arg "ba ba ba"
            // We see it unquoted, so we just return the value
            const kind =
            (){
                // No input yet
                if (previousKind == Kind.none)
                    return Kind.positionalArgument;

                // Say, if the input is malformed, we consider everything after that orphans,
                // I guess this is pretty reasonable.
                if (previousKind.has(Kind.errorBit))
                    return Kind.orphanArgument;

                if (previousKind.has(Kind.argumentNameBit))
                    return Kind.namedArgumentValueOrOrphanArgument;

                // Just to be sure nothing went wrong.
                // NOTE(review): `twoDashesDelimiter` carries no `valueBit`, so this presumably
                // fires when a plain value is tokenized right after `--` - confirm whether
                // callers are expected to switch to `leftoverRange` at that point.
                assert(previousKind.has(Kind.valueBit));

                if (previousKind.hasEither(Kind.orphanArgumentBit | Kind.positionalArgumentBit))
                {
                    // Copy the positional or the orphan bit of the previous argument.
                    return previousKind & ~Kind.namedArgumentValueBit;
                }

                assert(previousKind == Kind.namedArgumentValue);
                return Kind.orphanArgument;
            }();
            const fullSlice = currentSlice;
            const valueSlice = currentSlice;
            _range.popFront();
            return ArgToken(kind, fullSlice, valueSlice);
        }
    }
}

/// Creates a tokenizer over `range` with the first token (if any) already parsed into `front`.
ArgTokenizer!TRange argTokenizer(TRange)(TRange range)
{
    auto result = ArgTokenizer!TRange(range);
    result.popFront();
    return result;
}

unittest
{
    import std.algorithm : equal;
    alias Kind = ArgToken.Kind;
    {
        auto args = ["hello", "world"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.positionalArgument, "hello", "hello"),
            ArgToken(Kind.positionalArgument, "world", "world"),
        ]));
    }
    {
        auto args = ["--hello", "world"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
            ArgToken(Kind.namedArgumentValueOrOrphanArgument, "world", "world"),
        ]));
    }

    {
        auto args = ["-hello", "world"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.shortNamedArgumentName, "-hello", "hello"),
            ArgToken(Kind.namedArgumentValueOrOrphanArgument, "world", "world"),
        ]));
    }
    {
        auto args = ["--hello", "world", "world2"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
            ArgToken(Kind.namedArgumentValueOrOrphanArgument, "world", "world"),
            ArgToken(Kind.orphanArgument, "world2", "world2"),
        ]));
    }
    {
        auto args = ["-hello=world"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.shortNamedArgumentName, "-hello", "hello"),
            ArgToken(Kind.namedArgumentValue, "world", "world"),
        ]));
    }
    {
        auto args = [`--hello="world"`];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
            ArgToken(Kind.namedArgumentValue, `"world"`, "world"),
        ]));
    }
    {
        auto args = [`--hello="--world"`];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--hello", "hello"),
            ArgToken(Kind.namedArgumentValue, `"--world"`, "--world"),
        ]));
    }
    {
        auto args = ["--"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.twoDashesDelimiter, "--", "--"),
        ]));
    }
    {
        auto args = ["-"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.error_singleDash, "-", ""),
        ]));
    }
    {
        auto args = ["---"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.error_threeOrMoreDashes, "---", "---"),
        ]));
    }
    {
        auto args = [" "];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.positionalArgument, " ", " "),
        ]));
    }
    {
        // A zero length argument must not cause a range violation.
        auto args = [""];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.positionalArgument, "", ""),
        ]));
    }
    {
        // Same, but right after an argument name: `--a ""`.
        auto args = ["--a", ""];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--a", "a"),
            ArgToken(Kind.namedArgumentValueOrOrphanArgument, "", ""),
        ]));
    }
    {
        auto args = ["--arg="];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
            ArgToken(Kind.error_noValueForNamedArgument, "", ""),
        ]));
    }
    {
        auto args = [`--arg="`];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
            ArgToken(Kind.error_malformedQuotes, `"`, `"`),
        ]));
    }
    {
        auto args = [`--arg="" stuff`];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
            ArgToken(Kind.error_inputAfterClosedQuote, `"" stuff`, `" stuff`),
        ]));
    }
    {
        auto args = [`--arg="stuff`];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
            ArgToken(Kind.error_unclosedQuotes, `"stuff`, "stuff"),
        ]));
    }
    {
        auto args = [`--arg= `];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
            ArgToken(Kind.error_spaceAfterAssignment, " ", " "),
        ]));
    }
    {
        // --arg "--arg=stuff"
        // is expected to parse as
        // --arg --arg=stuff
        auto args = [`--arg`, `--arg=stuff`];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
            ArgToken(Kind.fullNamedArgumentName, "--arg", "arg"),
            ArgToken(Kind.namedArgumentValue, "stuff", "stuff"),
        ]));
    }
    {
        auto args = ["a", "--b", "c", "-d=e"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.positionalArgument, "a", "a"),

            ArgToken(Kind.fullNamedArgumentName, "--b", "b"),
            ArgToken(Kind.namedArgumentValueOrOrphanArgument, "c", "c"),

            ArgToken(Kind.shortNamedArgumentName, "-d", "d"),
            ArgToken(Kind.namedArgumentValue, "e", "e"),
        ]));
    }
    {
        auto args = ["--a", "Штука"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--a", "a"),
            ArgToken(Kind.namedArgumentValueOrOrphanArgument, "Штука", "Штука"),
        ]));
    }
    {
        auto args = [`--a="Штука"`];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--a", "a"),
            ArgToken(Kind.namedArgumentValue, `"Штука"`, "Штука"),
        ]));
    }
    {
        auto args = ["--a", "物事"];
        assert(equal(argTokenizer(args), [
            ArgToken(Kind.fullNamedArgumentName, "--a", "a"),
            ArgToken(Kind.namedArgumentValueOrOrphanArgument, "物事", "物事"),
        ]));
    }
    {
        // A tricky bug. The orphan argument after a sure positional would only have `valueBit`.
        auto args = ["test", "-hello=world", "abc"];
        auto tokenizer = argTokenizer(args);
        assert(equal(tokenizer, [
            ArgToken(Kind.positionalArgument, "test", "test"),
            ArgToken(Kind.shortNamedArgumentName, "-hello", "hello"),
            ArgToken(Kind.namedArgumentValue, "world", "world"),
            ArgToken(Kind.orphanArgument, "abc", "abc"),
        ]));
    }

    // // Copy and paste around for debugging.

    // import std.stdio : writeln;
    // import std.array : array;

    // auto p = argTokenizer(args);

    // writeln(p.front);
    // writeln(p.front.valueSlice);
    // // writeln(p._range.front[p._positionWithinCurrentString]);
    // writeln(p._positionWithinCurrentString);

    // p.popFront();

    // auto a = p.front();
    // writeln(a);
    // writeln(a.fullSlice);
    // writeln(a.nameSlice);
    // writeln(a.kind);
    // writeln(p._positionWithinCurrentString);

    // p.popFront();
    // writeln(p.empty);
}