
Internals

cmd.py

Module containing the implementation for the kebbie command line.

instantiate_correctors(keyboard, get_layout=True, fast_mode=True, instantiate_emulator=True)

Create the right correctors (with the right platform, etc.) given the arguments from the command line.

Parameters:

keyboard (str, required): Name of the keyboard to load.
fast_mode (bool, default True): If True, the corrector will be instantiated in fast mode (auto-correction only).
instantiate_emulator (bool, default True): If True, the emulators are instantiated (which triggers the layout detection). If False, only the corrector is instantiated, not the emulator.
get_layout (bool, default True): If True, the keyboard keys and suggestions will be mapped and shown on screen.

Returns:

List[EmulatorCorrector]: The list of created Correctors.

Source code in kebbie/cmd.py
def instantiate_correctors(
    keyboard: str, get_layout: bool = True, fast_mode: bool = True, instantiate_emulator: bool = True
) -> List[EmulatorCorrector]:
    """Create the right correctors (with the right platform, etc...) given the
    arguments from the command line.

    Args:
        keyboard (str): Name fo the keyboard to load.
        fast_mode (bool, optional): If `True`, the corrector will be
            instantiated in fast mode (only AC).
        instantiate_emulator (bool, optional): If `True`, the emulators are
            instantiated (which trigger the layout detection). If `False`, only
            the corrector is instantiated, not the emulator.
        get_layout (bool, optional):  If `True`, The keyboard keys and suggestions
            will be mapped and shown on screen.

    Returns:
        The list of created Correctors.
    """
    if keyboard in ["gboard", "tappa", "swiftkey", "yandex"]:
        # Android keyboards
        return [
            EmulatorCorrector(
                device=d,
                platform="android",
                keyboard=keyboard,
                fast_mode=fast_mode,
                instantiate_emulator=instantiate_emulator,
                get_layout=get_layout,
            )
            for d in Emulator.get_android_devices()
        ]
    else:
        # iOS keyboards
        return [
            EmulatorCorrector(
                device=i,
                platform="ios",
                keyboard=keyboard,
                fast_mode=fast_mode,
                instantiate_emulator=instantiate_emulator,
                ios_name=ios_name,
                ios_platform=ios_platform,
                get_layout=get_layout,
            )
            for i, (ios_platform, ios_name) in enumerate(Emulator.get_ios_devices())
        ]
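
For reference, a minimal usage sketch (assuming adb is available and at least one Android emulator is listed by adb devices; the platform and device attributes are set in EmulatorCorrector, documented below):

from kebbie.cmd import instantiate_correctors

# Create one corrector per running Android device, without triggering the
# (slow) layout detection or instantiating the emulators yet.
correctors = instantiate_correctors("gboard", get_layout=False, instantiate_emulator=False)
for corrector in correctors:
    print(corrector.platform, corrector.device)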

common_args(parser)

Add common arguments to the given parser.

Parameters:

parser (argparse.ArgumentParser, required): Parser where to add the arguments.

Source code in kebbie/cmd.py
def common_args(parser: argparse.ArgumentParser):
    """Add common arguments to the given parser.

    Args:
        parser (argparse.ArgumentParser): Parser where to add the arguments.
    """
    parser.add_argument(
        "--keyboard",
        "-K",
        dest="keyboard",
        type=str,
        required=True,
        choices=["gboard", "ios", "kbkitpro", "kbkitoss", "tappa", "fleksy", "swiftkey", "yandex"],
        help="Which keyboard, to be tested, is currently installed on the emulator.",
    )
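
As an illustration, here is how these shared arguments might be wired into a stand-alone parser (the "demo" sub-command is hypothetical, for demonstration only):

import argparse

from kebbie.cmd import common_args

parser = argparse.ArgumentParser(description="Demo parser.")
subparsers = parser.add_subparsers(title="commands", dest="cmd")
demo_parser = subparsers.add_parser("demo")  # hypothetical sub-command
common_args(demo_parser)

args = parser.parse_args(["demo", "--keyboard", "gboard"])
print(args.keyboard)  # -> gboard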

cli()

Entry-point of the kebbie command line.

Source code in kebbie/cmd.py
def cli():
    """Entry-point of the `kebbie` command line."""
    # create the top-level parser
    parser = argparse.ArgumentParser(description="Kebbie's command line.")
    subparsers = parser.add_subparsers(title="commands", dest="cmd")

    evaluate_parser = subparsers.add_parser("evaluate", help="Run the evaluation using emulated keyboard.")
    evaluate_parser.set_defaults(cmd="evaluate")
    common_args(evaluate_parser)
    evaluate_parser.add_argument(
        "--result_file",
        "-R",
        dest="result_file",
        type=str,
        default="results.json",
        help="When to save the results of the evaluation",
    )
    evaluate_parser.add_argument(
        "--all_tasks",
        "-A",
        dest="all_tasks",
        action="store_true",
        default=False,
        help="If specified, all tasks are evaluated (not only auto-correction, but also auto-completion and "
        "next-word prediction).",
    )
    evaluate_parser.add_argument(
        "--n_sentences",
        "-N",
        dest="n_sentences",
        type=int,
        default=100,
        help="The number of sentences to use for the evaluation. Emulated keyboard are slow, so we can't run on the "
        "full test set. Instead we pick the first N sentences.",
    )
    evaluate_parser.add_argument(
        "--track_mistakes",
        "-T",
        dest="track_mistakes",
        action="store_true",
        default=False,
        help="If specified, mistakes will be tracked and saved in the result file.",
    )

    layout_parser = subparsers.add_parser(
        "show_layout", help="Display the layout over the keyboard for debugging purpose."
    )
    layout_parser.set_defaults(cmd="show_layout")
    common_args(layout_parser)

    page_source_parser = subparsers.add_parser(
        "get_page_source", help="Save the page source of the keyboard in a file for debugging purpose."
    )
    page_source_parser.set_defaults(cmd="get_page_source")
    common_args(page_source_parser)
    page_source_parser.add_argument(
        "--page_source_file",
        "-F",
        dest="page_source_file",
        type=str,
        default="keyboard_page_source.xml",
        help="Where to save the keyboard page source",
    )
    page_source_parser.add_argument(
        "--print_page_source",
        "-P",
        dest="print_page_source",
        action="store_true",
        default=False,
        help="If specified, the page source will be shown in console too.",
    )

    args = parser.parse_args()

    if args.cmd is None:
        parser.print_help(sys.stderr)
        sys.exit(1)
    elif args.cmd == "evaluate":
        correctors = instantiate_correctors(args.keyboard, fast_mode=not args.all_tasks, instantiate_emulator=False)

        # Get dataset, and filter it to keep only a small number of sentences
        dataset = get_soda_dataset(args.n_sentences)

        # Run the evaluation
        results = evaluate(correctors, dataset=dataset, track_mistakes=args.track_mistakes)

        # Save the results in a file
        with open(args.result_file, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)

        print("Overall score : ", results["overall_score"])

    elif args.cmd == "show_layout":
        correctors = instantiate_correctors(args.keyboard)
        for c in correctors:
            c.emulator.show_keyboards()
            print(f"Predictions : {c.emulator.get_predictions()}")

    elif args.cmd == "get_page_source":
        correctors = instantiate_correctors(args.keyboard, get_layout=False)

        for c in correctors:
            # Get the page source
            page_source = ET.fromstring(c.emulator.driver.page_source)

            # Get the keyboard package name
            keyboard_package = emulator.KEYBOARD_PACKAGE.get(args.keyboard, None)

            if keyboard_package:
                # Filter elements that have the specified package
                filtered_elements = [element for element in page_source if element.get("package") == keyboard_package]

                if filtered_elements:
                    # If there are filtered elements, create a new XML with those elements
                    filtered_page_source = ET.Element(page_source.tag, page_source.attrib)
                    filtered_page_source.extend(filtered_elements)
                    page_source = filtered_page_source

            page_source_str = ET.tostring(page_source, encoding="utf8").decode("utf8")

            # Print the keyboard elements to the console if specified
            if args.print_page_source:
                print(page_source_str)

            # Save the keyboard elements to a file
            with open(args.page_source_file, "w", encoding="utf-8") as file:
                file.write(page_source_str)

correctors.py

Module containing the base Corrector class.

EmulatorCorrector

Bases: Corrector

Corrector using an emulated keyboard.

Parameters:

platform (str, required): Name of the platform used. android or ios.
keyboard (str, required): Name of the keyboard to test.
device (str, default None): Device UDID to use for the emulator.
fast_mode (bool, default True): If True, only auto-correction will be tested, and suggestions will not be retrieved. This is faster because we don't take screenshots or run OCR.
instantiate_emulator (bool, default True): If False, the emulator is not initialized (it will only be initialized after being pickled). This is useful to quickly create instances of this class without going through the whole layout detection (which takes time) twice: at initialization and after being pickled.

Source code in kebbie/correctors.py
class EmulatorCorrector(Corrector):
    """Corrector using an emulated keyboard.

    Args:
        platform (str): Name of the platform used. `android` or `ios`.
        keyboard (str): Name of the keyboard to test.
        device (str): Device UDID to use for the emulator.
        fast_mode (bool): If `True`, only auto-correction will be tested,
            and suggestions will not be retrieved. This is faster because
            we don't take screenshots or run OCR.
        instantiate_emulator (bool): If `False`, the emulator is not
            initialized (it will only be initialized after being pickled).
            This is useful to quickly create instances of this class,
            without going through the whole layout detection (which takes
            time) twice: at initialization and after being pickled.
    """

    def __init__(
        self,
        platform: str,
        keyboard: str,
        device: str = None,
        fast_mode: bool = True,
        ios_name: str = None,
        ios_platform: str = None,
        instantiate_emulator: bool = True,
        get_layout: bool = True,
    ):
        super().__init__()

        self.platform = platform
        self.keyboard = keyboard
        self.device = device
        self.fast_mode = fast_mode
        self.ios_name = ios_name
        self.ios_platform = ios_platform
        self.get_layout = get_layout

        self.emulator = None
        if instantiate_emulator:
            self.emulator = Emulator(
                self.platform,
                self.keyboard,
                device=self.device,
                ios_name=self.ios_name,
                ios_platform=self.ios_platform,
                get_layout=self.get_layout,
            )

        # Typing on the keyboard is slow. Because we go through several
        # auto-correction calls in one sentence, keep track of the previously
        # typed context, so we can just type the remaining characters
        self.previous_context = ""

    def __reduce__(self) -> Tuple:
        """This method simply makes the object pickable.

        Returns:
            Tuple of callable and arguments.
        """
        return (
            self.__class__,
            (self.platform, self.keyboard, self.device, self.fast_mode, self.ios_name, self.ios_platform),
        )

    def cached_type(self, context: str, word: str):
        """This class keeps track of the content of the context currently
        typed in the emulator. This method uses this current context to
        determine if we need to retype the sentence or not. Instead of
        always erasing the content being typed, we can directly type the
        remaining characters, which saves time.

        Args:
            context (str): Context to paste.
            word (str): Word to type.
        """
        sentence = context + word
        if sentence.startswith(self.previous_context):
            # The sentence to type starts with the previous context
            # Don't retype everything, just what we need
            self.emulator.type_characters(sentence[len(self.previous_context) :])
        else:
            # The previous context is not right, erase everything and type it
            self.emulator.paste(context)
            self.emulator.type_characters(word)
        self.previous_context = sentence

    def auto_correct(
        self,
        context: str,
        keystrokes: List[Optional[Tuple[float, float]]],
        word: str,
    ) -> List[str]:
        """Implementation of `auto_correct` method for emulated keyboards.

        Args:
            context (str): String representing the previously typed characters
                (the beginning of the sentence basically).
            keystrokes (List[Optional[Tuple[float, float]]]): List of positions
                (x and y coordinates) for each keystroke of the word being
                typed.
            word (str): Word being typed (corresponding to the keystrokes).

        Returns:
            The list of correction candidates.
        """
        self.cached_type(context, word)
        candidates = self.emulator.get_predictions() if not self.fast_mode else []

        candidates = [c for c in candidates if c != ""]

        # On keyboards, the leftmost candidate is the word being typed without
        # any change. If the word doesn't have a typo, this first candidate
        # should be kept as the auto-correction, but if the word has a typo,
        # we should remove it from the candidates list (as it will be
        # auto-corrected).
        # In order to know if it will be auto-corrected or not, we have no
        # choice but to type a space and retrieve the current text to see if it
        # was auto-corrected or not.
        self.emulator.type_characters(" ")
        self.previous_context = self.emulator.get_text()
        autocorrection = self.previous_context[len(context) :].strip()

        if len(candidates) == 0:
            candidates = [autocorrection]
        elif candidates[0] != autocorrection:
            candidates.pop(0)
            if autocorrection not in candidates:
                candidates.insert(0, autocorrection)

        return candidates

    def auto_complete(
        self,
        context: str,
        keystrokes: List[Optional[Tuple[float, float]]],
        partial_word: str,
    ) -> List[str]:
        """Implementation of `auto_complete` method for emulated keyboards.

        Args:
            context (str): String representing the previously typed characters
                (the beginning of the sentence basically).
            keystrokes (List[Optional[Tuple[float, float]]]): List of positions
                (x and y coordinates) for each keystroke of the word being
                typed.
            partial_word (str): Partial word being typed (corresponding to the
                keystrokes).

        Returns:
            The list of completion candidates.
        """
        if self.fast_mode:
            return []

        self.cached_type(context, partial_word)
        candidates = self.emulator.get_predictions()

        candidates = [c for c in candidates if c != ""]

        return candidates

    def predict_next_word(self, context: str) -> List[str]:
        """Implementation of `predict_next_word` method for emulated keyboards.

        Args:
            context (str): String representing the previously typed characters
                (the beginning of the sentence basically).

        Returns:
            The list of next-word candidates.
        """
        if self.fast_mode:
            return []

        # In order to get the predictions, the space should be typed
        assert context[-1] == " "
        self.cached_type(context[:-1], " ")
        candidates = self.emulator.get_predictions()
        candidates = [c for c in candidates if c != ""]

        return candidates

__reduce__()

This method simply makes the object picklable.

Returns:

Tuple: Tuple of callable and arguments.

Source code in kebbie/correctors.py
def __reduce__(self) -> Tuple:
    """This method simply makes the object pickable.

    Returns:
        Tuple of callable and arguments.
    """
    return (
        self.__class__,
        (self.platform, self.keyboard, self.device, self.fast_mode, self.ios_name, self.ios_platform),
    )

cached_type(context, word)

The class keeps track of the context currently typed in the emulator. This method uses that context to determine whether we need to retype the sentence or not. Instead of always erasing the content being typed, we can directly type the remaining characters, which saves time.

Parameters:

context (str, required): Context to paste.
word (str, required): Word to type.

Source code in kebbie/correctors.py
def cached_type(self, context: str, word: str):
    """This class keeps track of the content of the context currently
    typed in the emulator. This method uses this current context to
    determine if we need to retype the sentence or not. Instead of
    always erasing the content being typed, we can directly type the
    remaining characters, which saves time.

    Args:
        context (str): Context to paste.
        word (str): Word to type.
    """
    sentence = context + word
    if sentence.startswith(self.previous_context):
        # The sentence to type starts with the previous context
        # Don't retype everything, just what we need
        self.emulator.type_characters(sentence[len(self.previous_context) :])
    else:
        # The previous context is not right, erase everything and type it
        self.emulator.paste(context)
        self.emulator.type_characters(word)
    self.previous_context = sentence
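
To make the caching behaviour concrete, here is a small self-contained sketch of the same prefix logic, outside of the emulator (illustrative only):

def plan_typing(previous_context: str, context: str, word: str) -> str:
    """Return the characters that actually need to be typed."""
    sentence = context + word
    if sentence.startswith(previous_context):
        # Only the missing suffix has to be typed
        return sentence[len(previous_context):]
    # Otherwise the previous content is wrong: paste the context and type the word
    return word

# "I am ha" was already typed, so only "ppy" needs to be sent to the keyboard
print(plan_typing("I am ha", "I am ", "happy"))  # -> ppy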

auto_correct(context, keystrokes, word)

Implementation of auto_correct method for emulated keyboards.

Parameters:

context (str, required): String representing the previously typed characters (the beginning of the sentence basically).
keystrokes (List[Optional[Tuple[float, float]]], required): List of positions (x and y coordinates) for each keystroke of the word being typed.
word (str, required): Word being typed (corresponding to the keystrokes).

Returns:

List[str]: The list of correction candidates.

Source code in kebbie/correctors.py
def auto_correct(
    self,
    context: str,
    keystrokes: List[Optional[Tuple[float, float]]],
    word: str,
) -> List[str]:
    """Implementation of `auto_correct` method for emulated keyboards.

    Args:
        context (str): String representing the previously typed characters
            (the beginning of the sentence basically).
        keystrokes (List[Optional[Tuple[float, float]]]): List of positions
            (x and y coordinates) for each keystroke of the word being
            typed.
        word (str): Word being typed (corresponding to the keystrokes).

    Returns:
        The list of correction candidates.
    """
    self.cached_type(context, word)
    candidates = self.emulator.get_predictions() if not self.fast_mode else []

    candidates = [c for c in candidates if c != ""]

    # On keyboards, the leftmost candidate is the word being typed without
    # any change. If the word doesn't have a typo, this first candidate
    # should be kept as the auto-correction, but if the word has a typo,
    # we should remove it from the candidates list (as it will be
    # auto-corrected).
    # In order to know if it will be auto-corrected or not, we have no
    # choice but to type a space and retrieve the current text to see if it
    # was auto-corrected or not.
    self.emulator.type_characters(" ")
    self.previous_context = self.emulator.get_text()
    autocorrection = self.previous_context[len(context) :].strip()

    if len(candidates) == 0:
        candidates = [autocorrection]
    elif candidates[0] != autocorrection:
        candidates.pop(0)
        if autocorrection not in candidates:
            candidates.insert(0, autocorrection)

    return candidates
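
The candidate-handling logic at the end of this method can be read as a small pure function (an illustrative sketch, not part of the library):

from typing import List

def merge_autocorrection(candidates: List[str], autocorrection: str) -> List[str]:
    """Mirror how auto_correct merges the applied correction into the candidates."""
    candidates = [c for c in candidates if c != ""]
    if len(candidates) == 0:
        return [autocorrection]
    if candidates[0] != autocorrection:
        # The leftmost candidate was the raw typed word and it got corrected:
        # drop it and make sure the applied correction is ranked first.
        candidates.pop(0)
        if autocorrection not in candidates:
            candidates.insert(0, autocorrection)
    return candidates

print(merge_autocorrection(["hllo", "hello", "hell"], "hello"))  # -> ['hello', 'hell']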

auto_complete(context, keystrokes, partial_word)

Implementation of auto_complete method for emulated keyboards.

Parameters:

context (str, required): String representing the previously typed characters (the beginning of the sentence basically).
keystrokes (List[Optional[Tuple[float, float]]], required): List of positions (x and y coordinates) for each keystroke of the word being typed.
partial_word (str, required): Partial word being typed (corresponding to the keystrokes).

Returns:

List[str]: The list of completion candidates.

Source code in kebbie/correctors.py
def auto_complete(
    self,
    context: str,
    keystrokes: List[Optional[Tuple[float, float]]],
    partial_word: str,
) -> List[str]:
    """Implementation of `auto_complete` method for emulated keyboards.

    Args:
        context (str): String representing the previously typed characters
            (the beginning of the sentence basically).
        keystrokes (List[Optional[Tuple[float, float]]]): List of positions
            (x and y coordinates) for each keystroke of the word being
            typed.
        partial_word (str): Partial word being typed (corresponding to the
            keystrokes).

    Returns:
        The list of completion candidates.
    """
    if self.fast_mode:
        return []

    self.cached_type(context, partial_word)
    candidates = self.emulator.get_predictions()

    candidates = [c for c in candidates if c != ""]

    return candidates

predict_next_word(context)

Implementation of predict_next_word method for emulated keyboards.

Parameters:

context (str, required): String representing the previously typed characters (the beginning of the sentence basically).

Returns:

List[str]: The list of next-word candidates.

Source code in kebbie/correctors.py
def predict_next_word(self, context: str) -> List[str]:
    """Implementation of `predict_next_word` method for emulated keyboards.

    Args:
        context (str): String representing the previously typed characters
            (the beginning of the sentence basically).

    Returns:
        The list of next-word candidates.
    """
    if self.fast_mode:
        return []

    # In order to get the predictions, the space should be typed
    assert context[-1] == " "
    self.cached_type(context[:-1], " ")
    candidates = self.emulator.get_predictions()
    candidates = [c for c in candidates if c != ""]

    return candidates

emulator.py

Module containing the code necessary to interact with the emulators, using Appium.

Emulator

Class used to interact with an emulator and type words on a given keyboard.

Parameters:

platform (str, required): android or ios.
keyboard (str, required): The name of the keyboard installed on the emulator. This is needed because each keyboard has a different layout, and we need to know each key's position in order to type words.
device (str, default None): Device UDID to use.
host (str, default '127.0.0.1'): Appium server's address.
port (str, default '4723'): Appium server's port.
get_layout (bool, default True): Set to False to skip mapping the keys.

Raises:

ValueError: Error raised if the given platform doesn't exist.

Source code in kebbie/emulator.py
class Emulator:
    """Class used to interact with an emulator and type word on a given keyboard.

    Args:
        platform (str): `android` or `ios`.
        keyboard (str): The name of the keyboard installed on the emulator.
            This is needed because each keyboard has a different layout, and we
            need to know each key's position in order to type words.
        device (str, optional): Device UDID to use.
        host (str, optional): Appium server's address.
        port (str, optional): Appium server's port.
        get_layout (bool, optional): Set to False to skip mapping the keys.

    Raises:
        ValueError: Error raised if the given platform doesn't exist.
    """

    def __init__(  # noqa: C901
        self,
        platform: str,
        keyboard: str,
        device: str = None,
        host: str = "127.0.0.1",
        port: str = "4723",
        ios_name: str = None,
        ios_platform: str = None,
        get_layout: bool = True,
    ):
        super().__init__()

        self.platform = platform.lower()
        if self.platform not in [ANDROID, IOS]:
            raise ValueError(f"Unknown platform : {self.platform}. Please specify `{ANDROID}` or `{IOS}`.")

        # Start appium
        capabilities = ANDROID_CAPABILITIES if self.platform == ANDROID else IOS_CAPABILITIES
        if self.platform == IOS:
            capabilities["deviceName"] = ios_name
            capabilities["platformVersion"] = ios_platform
            capabilities["wdaLocalPort"] = 8000 + (device if device is not None else 0)
        if self.platform == ANDROID and device is not None:
            capabilities["udid"] = device
        self.driver = webdriver.Remote(f"{host}:{port}", capabilities)
        self.driver.implicitly_wait(20)

        self.screen_size = self.driver.get_window_size()

        self.keyboard = keyboard.lower()

        # Access a typing field
        self.typing_field = None
        self._access_typing_field()

        # Keep track of the keyboard behavior
        # When the typing field is empty, the keyboard is uppercase by default
        self.kb_is_upper = True
        self.last_char_is_space = False
        self.last_char_is_eos = False

        # Set the keyboard as default
        if self.platform == ANDROID:
            self.select_keyboard(keyboard)

        # Get the right layout
        if get_layout:
            if self.keyboard == GBOARD:
                self.detected = GboardLayoutDetector(self.driver, self._tap)
                self.layout = self.detected.layout
            elif self.keyboard == TAPPA:
                self.detected = TappaLayoutDetector(self.driver, self._tap)
                self.layout = self.detected.layout
            elif self.keyboard == FLEKSY:
                self.detected = FleksyLayoutDetector(self.driver)
                self.layout = self.detected.layout
            elif self.keyboard == IOS:
                self.detected = IosLayoutDetector(self.driver, self._tap)
                self.layout = self.detected.layout
            elif self.keyboard == KBKITPRO:
                self.detected = KbkitproLayoutDetector(self.driver, self._tap)
                self.layout = self.detected.layout
            elif self.keyboard == KBKITOSS:
                self.detected = KbkitossLayoutDetector(self.driver, self._tap)
                self.layout = self.detected.layout
            elif self.keyboard == SWIFTKEY:
                self.detected = SwiftkeyLayoutDetector(self.driver, self._tap)
                self.layout = self.detected.layout
            elif self.keyboard == YANDEX:
                self.detected = YandexLayoutDetector(self.driver, self._tap)
                self.layout = self.detected.layout
            else:
                raise ValueError(
                    f"Unknown keyboard : {self.keyboard}. Please specify `{GBOARD}`, `{TAPPA}`, `{FLEKSY}`, "
                    f"`{SWIFTKEY}`, `{YANDEX}`, `{KBKITPRO}`, `{KBKITOSS}` or `{IOS}`."
                )

        self.typing_field.clear()

    def _access_typing_field(self):
        """Start the right application and access the typing field where we
        will type our text.
        """
        if self.platform == ANDROID:
            subprocess.run(
                ["adb", "shell", "am", "start", "-a", "android.intent.action.VIEW", "-d", BROWSER_PAD_URL],
                stdout=subprocess.PIPE,
            )
            typing_field_loaded = False
            while not typing_field_loaded:
                typing_fields = self.driver.find_elements(By.CLASS_NAME, ANDROID_TYPING_FIELD_CLASS_NAME)
                typing_field_loaded = len(typing_fields) == 2
            self.typing_field = typing_fields[0]
        else:
            self.driver.find_element(By.CLASS_NAME, IOS_START_CHAT_CLASS_NAME).click()
            self.typing_field = self.driver.find_element(By.ID, IOS_TYPING_FIELD_ID)
        self.typing_field.click()
        self.typing_field.clear()

    def get_android_devices() -> List[str]:
        """Static method that uses the `adb devices` command to retrieve the
        list of devices running.

        Returns:
            List of detected device UDID.
        """
        result = subprocess.run(["adb", "devices"], stdout=subprocess.PIPE)
        devices = result.stdout.decode().split("\n")
        devices = [d.split()[0] for d in devices if not (d.startswith("List of devices attached") or len(d) == 0)]
        return devices

    def select_keyboard(self, keyboard):
        """Searches the IME of the desired keyboard and selects it, only for Android.

        Args:
            keyboard (str): Keyboard to search.
        """
        if keyboard not in KEYBOARD_PACKAGE:
            print(
                f"Warning ! {keyboard}'s IME isn't provided (in `KEYBOARD_PACKAGE`), can't automatically select the "
                "keyboard."
            )
            return

        ime_list = subprocess.check_output(["adb", "shell", "ime", "list", "-s"], universal_newlines=True)
        ime_name = None
        for ime in ime_list.strip().split("\n"):
            if KEYBOARD_PACKAGE[keyboard] in ime:
                ime_name = ime
                break
        if ime_name:
            subprocess.run(
                ["adb", "shell", "settings", "put", "secure", "show_ime_with_hard_keyboard", "1"],
                stdout=subprocess.PIPE,
            )
            subprocess.run(["adb", "shell", "ime", "enable", ime_name], stdout=subprocess.PIPE)
            subprocess.run(["adb", "shell", "ime", "set", ime_name], stdout=subprocess.PIPE)

    def get_ios_devices() -> List[Tuple[str, str]]:
        """Static method that uses the `xcrun simctl` command to retrieve the
        list of booted devices.

        Returns:
            List of booted device platform and device name.
        """
        devices = []

        result = subprocess.run(["xcrun", "simctl", "list", "devices"], stdout=subprocess.PIPE)
        out = result.stdout.decode().split("\n")

        curr_platform = ""
        for line in out:
            if line.startswith("== ") and line.endswith(" =="):
                continue
            elif line.startswith("-- ") and line.endswith(" --"):
                curr_platform = line[3:-3]
            else:
                m = re.match(r"\s+([^\t]+)\s+\([A-Z0-9\-]+\)\s+\((Booted|Shutdown)\)", line)
                if m:
                    device_name = m.group(1)
                    status = m.group(2)

                    if status == "Booted" and curr_platform.startswith("iOS "):
                        devices.append((curr_platform[4:], device_name))

        return devices

    def _paste(self, text: str):
        """Paste the given text into the typing field, to quickly simulate
        typing a context.

        Args:
            text (str): Text to paste.
        """
        if text == "":
            self.typing_field.clear()
            self.kb_is_upper = True
            self.last_char_is_space = False
            self.last_char_is_eos = False
        else:
            # Note : on Android, pasting content in the field will erase the previous content
            # (which is what we want). On iOS it will not, we need to do it "manually"
            if self.platform == IOS:
                self.typing_field.clear()
            if self.keyboard == KBKITPRO or self.keyboard == KBKITOSS or self.keyboard == FLEKSY:
                # In the case of KeyboardKit / Fleksy, after pasting the content, typing a space
                # triggers punctuation (because the previous context may end with a space)
                # To avoid this behavior, break the cycle by typing a backspace
                self._tap(self.layout["lowercase"]["backspace"])
            self.typing_field.send_keys(text)
            self.kb_is_upper = len(text) > 1 and self._is_eos(text[-2]) and text.endswith(" ")
            self.last_char_is_space = text.endswith(" ")
            self.last_char_is_eos = self._is_eos(text[-1])

    def paste(self, text: str):
        """Paste the given text into the typing field, to quickly simulate
        typing a context.

        This method is just a wrapper around `_paste()`, making sure the typing
        field is accessible. If for some reason it is not accessible, it tries
        to access it and perform the action again.

        Args:
            text (str): Text to paste.
        """
        try:
            self._paste(text)
        except StaleElementReferenceException:
            self._access_typing_field()
            self._paste(text)

    def type_characters(self, characters: str):  # noqa: C901
        """Type the given sentence on the keyboard. For each character, it
        finds the keys to press and sends taps on the keyboard.

        Args:
            characters (str): The sentence to type.
        """
        for c in characters:
            if c == " ":
                if self.last_char_is_space:
                    # If the previous character was a space, don't retype a space
                    # because it can be transformed into a `.`
                    continue

                if self.kb_is_upper:
                    self._tap(self.layout["uppercase"]["spacebar"])
                else:
                    self._tap(self.layout["lowercase"]["spacebar"])

                # Behavior of the keyboard: if the previous character typed was an EOS marker
                # and a space is typed, the keyboard automatically switches to uppercase
                if self.last_char_is_eos:
                    self.kb_is_upper = True
            elif c in self.layout["lowercase"]:
                # The character is a lowercase character
                if self.kb_is_upper:
                    # If the keyboard is in uppercase mode, change it to lowercase
                    self._tap(self.layout["uppercase"]["shift"])
                    if self.keyboard == SWIFTKEY:
                        # Swiftkey needs double tap, otherwise we are capslocking
                        self._tap(self.layout["uppercase"]["shift"])
                self._tap(self.layout["lowercase"][c])
            elif c in self.layout["uppercase"]:
                # The character is an uppercase character
                if not self.kb_is_upper:
                    # Change the keyboard to uppercase
                    self._tap(self.layout["lowercase"]["shift"])
                self._tap(self.layout["uppercase"][c])
                # After typing one character, the keyboard automatically comes back to lowercase
            elif c in self.layout["numbers"]:
                # The character is a number or a special character
                # Access the number keyboard properly
                if self.kb_is_upper:
                    self._tap(self.layout["uppercase"]["numbers"])
                else:
                    self._tap(self.layout["lowercase"]["numbers"])
                self._tap(self.layout["numbers"][c])

                if c != "'" or self.keyboard in [GBOARD, SWIFTKEY]:
                    # For some reason, when `'` is typed, the keyboard automatically goes back
                    # to lowercase, so no need to re-tap the button (unless the keyboard is GBoard / Swiftkey).
                    # In all other cases, switch back to letters keyboard
                    self._tap(self.layout["numbers"]["letters"])
            else:
                # Can't type this character, ignore it
                continue

            # Behavior of the keyboard: if the previous character typed was an EOS marker
            # and a space is typed, the keyboard automatically switches to uppercase
            self.kb_is_upper = self.last_char_is_eos and c == " "

            # Update infos about what we typed
            self.last_char_is_eos = self._is_eos(c)
            self.last_char_is_space = c == " "

    def _is_eos(self, c: str) -> bool:
        """Check if the given character is an End-Of-Sentence marker. If an EOS
        marker is typed followed by a space, the keyboard automatically switches
        to uppercase letters (unless it's GBoard).

        Args:
            c (str): Character to check.

        Returns:
            True if the character is an EOS marker.
        """
        if self.keyboard == GBOARD:
            return False
        else:
            return c in [".", "!", "?"]

    def _tap(self, frame: List[int], keyboard_frame: List[int] = None):
        """Tap on the screen at the position described by the given frame.

        Args:
            frame (List[int]): Frame describing the position where to tap. A
                frame is : [start_pos_x, start_pos_y, width, height].
            keyboard_frame (List[int]): If specified, the Keyboard frame to
                use. If `None`, it will use `self.layout["keyboard_frame"]`.
        """
        x, y, w, h = frame
        base_x, base_y, *_ = keyboard_frame if keyboard_frame else self.layout["keyboard_frame"]

        pos_x = base_x + x + int(w / 2)
        pos_y = base_y + y + int(h / 2)

        actions = ActionChains(self.driver)
        actions.w3c_actions = ActionBuilder(self.driver, mouse=PointerInput(interaction.POINTER_TOUCH, "touch"))
        actions.w3c_actions.pointer_action.move_to_location(pos_x, pos_y)
        actions.w3c_actions.pointer_action.pointer_down()
        actions.w3c_actions.pointer_action.pause(0.05)
        actions.w3c_actions.pointer_action.release()
        actions.perform()

    def _take_screenshot(self):
        """Take a screenshot of the full screen.

        Returns:
            The image of the screen.
        """
        screen_data = self.driver.get_screenshot_as_png()
        screen = np.asarray(Image.open(io.BytesIO(screen_data)))
        return screen.copy()

    def get_predictions(self, lang: str = "en") -> List[str]:
        """Retrieve the predictions displayed by the keyboard.

        Args:
            lang (str): Language to use for the OCR.

        Returns:
            List of predictions from the keyboard.
        """
        if hasattr(self, "detected"):
            # Only keyboards that were auto-detected (using XML tree) have the
            # attribute `detected`. If that's the case, it means we
            # can retrieve the suggestions directly from the XML tree !
            predictions = self.detected.get_suggestions()
        else:
            # Other keyboards still have to use (slow) OCR
            time.sleep(PREDICTION_DELAY)
            screen = self._take_screenshot()

            kb_x, kb_y, kb_w, kb_h = self.layout["keyboard_frame"]
            screen = screen[kb_y : kb_y + kb_h, kb_x : kb_x + kb_w]

            predictions = []
            for x, y, w, h in self.layout["suggestions_frames"]:
                suggestion_area = screen[y : y + h, x : x + w]
                ocr_results = pytesseract.image_to_string(suggestion_area, config=TESSERACT_CONFIG)
                pred = ocr_results.strip().replace("“", "").replace('"', "").replace("\\", "")
                predictions.append(pred)

        return predictions

    def _get_text(self) -> str:
        """Return the text currently contained in the typing field.

        Returns:
            Text of the typing field.
        """
        return self.typing_field.text

    def get_text(self) -> str:
        """Return the text currently contained in the typing field.

        This method is just a wrapper around `_get_text()`, making sure the
        typing field is accessible. If for some reason it is not accessible, it
        tries to access it and perform the action again.

        Returns:
            Text of the typing field.
        """
        try:
            return self._get_text()
        except StaleElementReferenceException:
            self._access_typing_field()
            return self._get_text()

    def show_keyboards(self):
        """Take a screenshot and overlay the given layout, for debugging the
        position of each key.
        """
        # Type a character, in order to have some suggestions
        # Keyboard starts with an uppercase letter by default (unless GBoard), and
        # automatically goes to lowercase after
        if self.keyboard == GBOARD:
            self._tap(self.layout["lowercase"]["a"])
        else:
            self._tap(self.layout["uppercase"]["A"])
        screen_lower = self._take_screenshot()

        self._tap(self.layout["lowercase"]["shift"])
        screen_upper = self._take_screenshot()

        self._tap(self.layout["lowercase"]["numbers"])
        screen_numbers = self._take_screenshot()

        for layout_name, screen in zip(
            ["lowercase", "uppercase", "numbers"], [screen_lower, screen_upper, screen_numbers]
        ):
            self._set_area_box(screen, (0, 0), self.layout["keyboard_frame"], "keyboard frame")
            if "suggestions_frames" in self.layout:
                for i, suggestion_frame in enumerate(self.layout["suggestions_frames"]):
                    self._set_area_box(screen, self.layout["keyboard_frame"], suggestion_frame, f"suggestion {i}")
            for key_name, key_frame in self.layout[layout_name].items():
                self._set_area_box(screen, self.layout["keyboard_frame"], key_frame, key_name)

            cv2.imshow(layout_name, screen)

        cv2.waitKey(0)
        cv2.destroyAllWindows()

    def _set_area_box(self, image, base_coords: Tuple[int], coords: Tuple[int], tag: str):
        """Add an area box on the given image (color is random).

        Args:
            image: Image where to add the box.
            base_coords (Tuple[int]): Base coordinates from the full image.
            coords (Tuple[int]): Coordinates of the element, as well as
                dimensions.
            tag (str): Tag for this box.
        """
        base_x, base_y, *_ = base_coords
        x, y, w, h = coords
        x += base_x
        y += base_y
        # Generate color only until 200, to ensure it's dark enough
        color = (random.randint(0, 200), random.randint(0, 200), random.randint(0, 200))
        cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
        cv2.putText(image, tag, (x, y + h + 17), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
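
A minimal usage sketch of the class (assuming an Appium server is running on the default host and port, and a Gboard-equipped Android emulator is attached):

from kebbie.emulator import Emulator

devices = Emulator.get_android_devices()
emulator = Emulator("android", "gboard", device=devices[0])

# Paste a context, then type the rest character by character
emulator.paste("Hello ")
emulator.type_characters("ther")
print(emulator.get_predictions())  # suggestions for the partial word "ther"
print(emulator.get_text())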

get_android_devices()

Static method that uses the adb devices command to retrieve the list of devices running.

Returns:

List[str]: List of detected device UDID.

Source code in kebbie/emulator.py
def get_android_devices() -> List[str]:
    """Static method that uses the `adb devices` command to retrieve the
    list of devices running.

    Returns:
        List of detected device UDID.
    """
    result = subprocess.run(["adb", "devices"], stdout=subprocess.PIPE)
    devices = result.stdout.decode().split("\n")
    devices = [d.split()[0] for d in devices if not (d.startswith("List of devices attached") or len(d) == 0)]
    return devices
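
For clarity, the same parsing applied to a typical adb devices output (device names are made up for illustration):

sample_output = "List of devices attached\nemulator-5554\tdevice\nemulator-5556\tdevice\n\n"

devices = sample_output.split("\n")
devices = [d.split()[0] for d in devices if not (d.startswith("List of devices attached") or len(d) == 0)]
print(devices)  # -> ['emulator-5554', 'emulator-5556']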

select_keyboard(keyboard)

Searches for the IME of the desired keyboard and selects it (Android only).

Parameters:

keyboard (str, required): Keyboard to search.

Source code in kebbie/emulator.py
def select_keyboard(self, keyboard):
    """Searches the IME of the desired keyboard and selects it, only for Android.

    Args:
        keyboard (str): Keyboard to search.
    """
    if keyboard not in KEYBOARD_PACKAGE:
        print(
            f"Warning ! {keyboard}'s IME isn't provided (in `KEYBOARD_PACKAGE`), can't automatically select the "
            "keyboard."
        )
        return

    ime_list = subprocess.check_output(["adb", "shell", "ime", "list", "-s"], universal_newlines=True)
    ime_name = None
    for ime in ime_list.strip().split("\n"):
        if KEYBOARD_PACKAGE[keyboard] in ime:
            ime_name = ime
            break
    if ime_name:
        subprocess.run(
            ["adb", "shell", "settings", "put", "secure", "show_ime_with_hard_keyboard", "1"],
            stdout=subprocess.PIPE,
        )
        subprocess.run(["adb", "shell", "ime", "enable", ime_name], stdout=subprocess.PIPE)
        subprocess.run(["adb", "shell", "ime", "set", ime_name], stdout=subprocess.PIPE)

get_ios_devices()

Static method that uses the xcrun simctl command to retrieve the list of booted devices.

Returns:

List[Tuple[str, str]]: List of booted device platform and device name.

Source code in kebbie/emulator.py
def get_ios_devices() -> List[Tuple[str, str]]:
    """Static method that uses the `xcrun simctl` command to retrieve the
    list of booted devices.

    Returns:
        List of booted device platform and device name.
    """
    devices = []

    result = subprocess.run(["xcrun", "simctl", "list", "devices"], stdout=subprocess.PIPE)
    out = result.stdout.decode().split("\n")

    curr_platform = ""
    for line in out:
        if line.startswith("== ") and line.endswith(" =="):
            continue
        elif line.startswith("-- ") and line.endswith(" --"):
            curr_platform = line[3:-3]
        else:
            m = re.match(r"\s+([^\t]+)\s+\([A-Z0-9\-]+\)\s+\((Booted|Shutdown)\)", line)
            if m:
                device_name = m.group(1)
                status = m.group(2)

                if status == "Booted" and curr_platform.startswith("iOS "):
                    devices.append((curr_platform[4:], device_name))

    return devices
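
To illustrate the regular expression above, a quick check on a fabricated xcrun simctl list devices line (device name and UUID are made up):

import re

line = "    iPhone 15 (A1B2C3D4-0000-0000-0000-000000000000) (Booted)"  # fabricated
m = re.match(r"\s+([^\t]+)\s+\([A-Z0-9\-]+\)\s+\((Booted|Shutdown)\)", line)
print(m.group(1), m.group(2))  # -> iPhone 15 Booted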

paste(text)

Paste the given text into the typing field, to quickly simulate typing a context.

This method is just a wrapper around _paste(), making sure the typing field is accessible. If for some reason it is not accessible, it tries to access it and perform the action again.

Parameters:

text (str, required): Text to paste.

Source code in kebbie/emulator.py
def paste(self, text: str):
    """Paste the given text into the typing field, to quickly simulate
    typing a context.

    This method is just a wrapper around `_paste()`, making sure the typing
    field is accessible. If for some reason it is not accessible, it tries
    to access it and perform the action again.

    Args:
        text (str): Text to paste.
    """
    try:
        self._paste(text)
    except StaleElementReferenceException:
        self._access_typing_field()
        self._paste(text)

type_characters(characters)

Type the given sentence on the keyboard. For each character, it finds the keys to press and sends taps on the keyboard.

Parameters:

characters (str, required): The sentence to type.

Source code in kebbie/emulator.py
def type_characters(self, characters: str):  # noqa: C901
    """Type the given sentence on the keyboard. For each character, it
    finds the keys to press and sends taps on the keyboard.

    Args:
        characters (str): The sentence to type.
    """
    for c in characters:
        if c == " ":
            if self.last_char_is_space:
                # If the previous character was a space, don't retype a space
                # because it can be transformed into a `.`
                continue

            if self.kb_is_upper:
                self._tap(self.layout["uppercase"]["spacebar"])
            else:
                self._tap(self.layout["lowercase"]["spacebar"])

            # Behavior of the keyboard: if the previous character typed was an EOS marker
            # and a space is typed, the keyboard automatically switches to uppercase
            if self.last_char_is_eos:
                self.kb_is_upper = True
        elif c in self.layout["lowercase"]:
            # The character is a lowercase character
            if self.kb_is_upper:
                # If the keyboard is in uppercase mode, change it to lowercase
                self._tap(self.layout["uppercase"]["shift"])
                if self.keyboard == SWIFTKEY:
                    # Swiftkey needs double tap, otherwise we are capslocking
                    self._tap(self.layout["uppercase"]["shift"])
            self._tap(self.layout["lowercase"][c])
        elif c in self.layout["uppercase"]:
            # The character is an uppercase character
            if not self.kb_is_upper:
                # Change the keyboard to uppercase
                self._tap(self.layout["lowercase"]["shift"])
            self._tap(self.layout["uppercase"][c])
            # After typing one character, the keyboard automatically comes back to lowercase
        elif c in self.layout["numbers"]:
            # The character is a number or a special character
            # Access the number keyboard properly
            if self.kb_is_upper:
                self._tap(self.layout["uppercase"]["numbers"])
            else:
                self._tap(self.layout["lowercase"]["numbers"])
            self._tap(self.layout["numbers"][c])

            if c != "'" or self.keyboard in [GBOARD, SWIFTKEY]:
                # For some reason, when `'` is typed, the keyboard automatically goes back
                # to lowercase, so no need to re-tap the button (unless the keyboard is GBoard / Swiftkey).
                # In all other cases, switch back to letters keyboard
                self._tap(self.layout["numbers"]["letters"])
        else:
            # Can't type this character, ignore it
            continue

        # Behavior of the keyboard: if the previous character typed was an EOS marker
        # and a space is typed, the keyboard automatically switches to uppercase
        self.kb_is_upper = self.last_char_is_eos and c == " "

        # Update infos about what we typed
        self.last_char_is_eos = self._is_eos(c)
        self.last_char_is_space = c == " "

get_predictions(lang='en')

Retrieve the predictions displayed by the keyboard.

Parameters:

lang (str, default 'en'): Language to use for the OCR.

Returns:

List[str]: List of predictions from the keyboard.

Source code in kebbie/emulator.py
def get_predictions(self, lang: str = "en") -> List[str]:
    """Retrieve the predictions displayed by the keyboard.

    Args:
        lang (str): Language to use for the OCR.

    Returns:
        List of predictions from the keyboard.
    """
    if hasattr(self, "detected"):
        # Only keyboards that were auto-detected (using XML tree) have the
        # attribute `detected`. If that's the case, it means we
        # can retrieve the suggestions directly from the XML tree !
        predictions = self.detected.get_suggestions()
    else:
        # Other keyboards still have to use (slow) OCR
        time.sleep(PREDICTION_DELAY)
        screen = self._take_screenshot()

        kb_x, kb_y, kb_w, kb_h = self.layout["keyboard_frame"]
        screen = screen[kb_y : kb_y + kb_h, kb_x : kb_x + kb_w]

        predictions = []
        for x, y, w, h in self.layout["suggestions_frames"]:
            suggestion_area = screen[y : y + h, x : x + w]
            ocr_results = pytesseract.image_to_string(suggestion_area, config=TESSERACT_CONFIG)
            pred = ocr_results.strip().replace("“", "").replace('"', "").replace("\\", "")
            predictions.append(pred)

    return predictions

get_text()

Return the text currently contained in the typing field.

This method is just a wrapper around _get_text(), making sure the typing field is accessible. If for some reason it is not accessible, it tries to access it and perform the action again.

Returns:

str: Text of the typing field.

Source code in kebbie/emulator.py
def get_text(self) -> str:
    """Return the text currently contained in the typing field.

    This method is just a wrapper around `_get_text()`, making sure the
    typing field is accessible. If for some reason it is not accessible, it
    tries to access it and perform the action again.

    Returns:
        Text of the typing field.
    """
    try:
        return self._get_text()
    except StaleElementReferenceException:
        self._access_typing_field()
        return self._get_text()

show_keyboards()

Take a screenshot and overlay the given layout, for debugging the position of each key.

Source code in kebbie/emulator.py
def show_keyboards(self):
    """Take a screenshot and overlay the given layout, for debugging the
    position of each key.
    """
    # Type a character, in order to have some suggestions
    # Keyboard starts with uppercase letter by default (unless GBoard), and
    # automatically goes to lowercase afterwards
    if self.keyboard == GBOARD:
        self._tap(self.layout["lowercase"]["a"])
    else:
        self._tap(self.layout["uppercase"]["A"])
    screen_lower = self._take_screenshot()

    self._tap(self.layout["lowercase"]["shift"])
    screen_upper = self._take_screenshot()

    self._tap(self.layout["lowercase"]["numbers"])
    screen_numbers = self._take_screenshot()

    for layout_name, screen in zip(
        ["lowercase", "uppercase", "numbers"], [screen_lower, screen_upper, screen_numbers]
    ):
        self._set_area_box(screen, (0, 0), self.layout["keyboard_frame"], "keyboard frame")
        if "suggestions_frames" in self.layout:
            for i, suggestion_frame in enumerate(self.layout["suggestions_frames"]):
                self._set_area_box(screen, self.layout["keyboard_frame"], suggestion_frame, f"suggestion {i}")
        for key_name, key_frame in self.layout[layout_name].items():
            self._set_area_box(screen, self.layout["keyboard_frame"], key_frame, key_name)

        cv2.imshow(layout_name, screen)

    cv2.waitKey(0)
    cv2.destroyAllWindows()

LayoutDetector

Base class for auto-detection of the keyboard layout.

To auto-detect a new keyboard, create a new sub-class, and override __init__() and get_suggestions(). Use the existing subclass for GBoard as a reference.

Parameters:

Name Type Description Default
driver Remote

The Appium driver, used to access elements on the emulator.

required
tap_fn Callable

A callback used to tap at a specific position on the screen. See Emulator._tap().

required
xpath_root str

XPath to the root element of the keyboard.

required
xpath_keys str

XPath to detect the keys elements.

required
Source code in kebbie/emulator.py
class LayoutDetector:
    """Base class for auto-detection of the keyboard layout.

    To auto-detect a new keyboard, create a new sub-class, and override
    `__init__()` and `get_suggestions()`. Use the existing subclass for GBoard
    as a reference.

    Args:
        driver (webdriver.Remote): The Appium driver, used to access elements
            on the emulator.
        tap_fn (Callable): A callback used to tap at a specific position on the
            screen. See `Emulator._tap()`.
        xpath_root (str): XPath to the root element of the keyboard.
        xpath_keys (str): XPath to detect the keys elements.
    """

    def __init__(
        self, driver: webdriver.Remote, tap_fn: Callable, xpath_root: str, xpath_keys: str, android: bool = True
    ):
        self.driver = driver
        self.tap = tap_fn
        self.xpath_root = xpath_root
        self.xpath_keys = xpath_keys
        self.android = android

        layout = {}

        # Get the root element of our keyboard
        root = self.driver.find_element(By.XPATH, self.xpath_root)

        # On an empty field, the keyboard is in uppercase
        # So first, retrieve the keyboard frame and uppercase characters
        kb_frame, screen_layout = self._detect_keys(root, current_layout="uppercase")
        layout["keyboard_frame"] = kb_frame
        layout["uppercase"] = screen_layout

        # Then, after typing a letter, the keyboard goes to lowercase automatically
        self.tap(layout["uppercase"]["A"], layout["keyboard_frame"])
        _, screen_layout = self._detect_keys(root, keyboard_frame=layout["keyboard_frame"], current_layout="lowercase")
        layout["lowercase"] = screen_layout

        # Finally, access the symbols keyboard and get characters positions
        self.tap(layout["lowercase"]["numbers"], layout["keyboard_frame"])
        _, screen_layout = self._detect_keys(root, keyboard_frame=layout["keyboard_frame"], current_layout="numbers")
        layout["numbers"] = screen_layout

        # Reset our keyboard to the original layer
        self.tap(layout["numbers"]["letters"], layout["keyboard_frame"])

        self.layout = layout

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Note that it's slower to access the XML through methods like
        `find_element()`, and it's faster to access the raw XML with
        `self.driver.page_source` and parse it as text directly.

        Raises:
            NotImplementedError: Exception raised if this method is not
                overwritten.

        Returns:
            List of suggestions from the keyboard.
        """
        raise NotImplementedError

    def _detect_keys(
        self, root: WebElement, current_layout: str, keyboard_frame: List[int] = None
    ) -> Tuple[List[int], Dict]:
        """This method detects all keys currently on screen.

        If no keyboard_frame is given, it will also detect the keyboard frame.

        Args:
            root (WebElement): Root element in the XML tree that represents the
                keyboard (with all its keys).
            current_layout (str): Name of the current layout.
            keyboard_frame (List[int], optional): Optionally, the keyboard
                frame (so we don't need to re-detect it every time).

        Returns:
            Keyboard frame
            Layout with all the keys detected on this screen.
        """
        layout = {}
        if keyboard_frame is None:
            if self.android:
                # Detect the keyboard frame
                kb = root.find_element(By.ID, "android:id/inputArea")
                keyboard_frame = self._get_frame(kb)
            else:
                keyboard_frame = self._get_frame(root)

        for key_elem in root.find_elements(By.XPATH, self.xpath_keys):
            label = self._get_label(key_elem, current_layout=current_layout)
            if label is not None:
                layout[label] = self._get_frame(key_elem)

        # Then update the letters positions to be relative to the keyboard frame
        for k in layout:
            layout[k][0] -= keyboard_frame[0]
            layout[k][1] -= keyboard_frame[1]

        return keyboard_frame, layout

    def _get_frame(self, element: WebElement) -> List[int]:
        """For layout detection, this method returns the bounds of the given
        element.

        Args:
            element (WebElement): XML Element describing a key.

        Returns:
            Bounds of this key.
        """
        if self.android:
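            # Android exposes bounds as a string like "[left,top][right,bottom]";
            # convert it to the [x, y, width, height] format used by the layout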
            m = re.match(r"\[(\d+),(\d+)\]\[(\d+),(\d+)\]", element.get_attribute("bounds"))
            if m:
                bounds = [int(g) for g in m.groups()]
                return [bounds[0], bounds[1], bounds[2] - bounds[0], bounds[3] - bounds[1]]
        else:
            r = json.loads(element.get_attribute("rect"))
            return [r["x"], r["y"], r["width"], r["height"]]

    def _get_label(self, element: WebElement, current_layout: str, is_suggestion: bool = False) -> str:
        """For layout detection, this method returns the content of the given
        element.

        This method returns `None` if it's a key we don't care about. This
        method takes care of translating the content (the name used in the XML
        tree is not the same as the one used in our layout).

        Args:
            element (WebElement): XML Element describing a key.
            current_layout (str): Name of the current layout.
            is_suggestion (bool, optional): If we are retrieving the content of
                a suggestion, the content shouldn't be translated.

        Returns:
            Content of the key, or None if it's a key we should ignore.
        """
        content = element.get_attribute("content-desc") if self.android else element.get_attribute("name")

        if is_suggestion:
            # If we are getting the content of the suggestion, return the content directly
            return content

        if content in CONTENT_TO_IGNORE:
            return None
        elif not self.android and content == "more":
            if current_layout == "uppercase" or current_layout == "lowercase":
                return "numbers"
            else:
                return "letters"
        elif content in CONTENT_TO_RENAME:
            return CONTENT_TO_RENAME[content]
        else:
            return content

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Note that it's slower to access the XML through methods like find_element(), and it's faster to access the raw XML with self.driver.page_source and parse it as text directly.

Raises:

Type Description
NotImplementedError

Exception raised if this method is not overwritten.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Note that it's slower to access the XML through methods like
    `find_element()`, and it's faster to access the raw XML with
    `self.driver.page_source` and parse it as text directly.

    Raises:
        NotImplementedError: Exception raised if this method is not
            overwritten.

    Returns:
        List of suggestions from the keyboard.
    """
    raise NotImplementedError
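
As a rough illustration of this pattern (a sketch only, with a hypothetical package name; the real implementations are shown in the sections below, and re, html and List are assumed to be the module-level imports already used above), an override typically parses self.driver.page_source directly:

class MyKeyboardLayoutDetector(LayoutDetector):
    def get_suggestions(self) -> List[str]:
        # Parse the raw XML dump instead of walking it with find_element()
        suggestions = []
        for section in self.driver.page_source.split("<android.widget.FrameLayout"):
            if "com.example.keyboard" in section:  # hypothetical package name
                m = re.search(r'content-desc="([^"]*)"', section)
                if m:
                    suggestions.append(html.unescape(m.group(1)))
        return suggestions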

GboardLayoutDetector

Bases: LayoutDetector

Layout detector for the Gboard keyboard. See LayoutDetector for more information.

Source code in kebbie/emulator.py
class GboardLayoutDetector(LayoutDetector):
    """Layout detector for the Gboard keyboard. See `LayoutDetector` for more
    information.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            xpath_root=f"./*/*[@package='{KEYBOARD_PACKAGE[GBOARD]}']",
            xpath_keys=".//*[@resource-id][@content-desc]",
            **kwargs,
        )

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        sections = [
            data
            for data in self.driver.page_source.split("<android.widget.FrameLayout")
            if "com.google.android.inputmethod" in data
        ]
        for section in sections:
            if "content-desc" in section and "resource-id" not in section and 'long-clickable="true"' in section:
                m = re.search(r"content\-desc=\"([^\"]*)\"", section)
                if m:
                    content = m.group(1)

                    # Deal with emojis
                    emoji = re.match(r"emoji (&[^;]+;)", content)
                    suggestions.append(html.unescape(emoji[1]) if emoji else content)

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    sections = [
        data
        for data in self.driver.page_source.split("<android.widget.FrameLayout")
        if "com.google.android.inputmethod" in data
    ]
    for section in sections:
        if "content-desc" in section and "resource-id" not in section and 'long-clickable="true"' in section:
            m = re.search(r"content\-desc=\"([^\"]*)\"", section)
            if m:
                content = m.group(1)

                # Deal with emojis
                emoji = re.match(r"emoji (&[^;]+;)", content)
                suggestions.append(html.unescape(emoji[1]) if emoji else content)

    return suggestions

IosLayoutDetector

Bases: LayoutDetector

Layout detector for the iOS default keyboard. See LayoutDetector for more information.

Source code in kebbie/emulator.py
class IosLayoutDetector(LayoutDetector):
    """Layout detector for the iOS default keyboard. See `LayoutDetector` for
    more information.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            xpath_root=".//XCUIElementTypeKeyboard",
            xpath_keys="(.//XCUIElementTypeKey|.//XCUIElementTypeButton)",
            android=False,
            **kwargs,
        )

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        sections = [
            data for data in self.driver.page_source.split("<XCUIElementTypeOther") if "name=" in data.split(">")[0]
        ]
        is_typing_predictions_section = False
        for section in sections:
            m = re.search(r"name=\"([^\"]*)\"", section)
            if m:
                name = m.group(1)

                if name == "Typing Predictions":
                    is_typing_predictions_section = True
                    continue

                if is_typing_predictions_section:
                    suggestions.append(name.replace("“", "").replace("”", ""))

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    sections = [
        data for data in self.driver.page_source.split("<XCUIElementTypeOther") if "name=" in data.split(">")[0]
    ]
    is_typing_predictions_section = False
    for section in sections:
        m = re.search(r"name=\"([^\"]*)\"", section)
        if m:
            name = m.group(1)

            if name == "Typing Predictions":
                is_typing_predictions_section = True
                continue

            if is_typing_predictions_section:
                suggestions.append(name.replace("“", "").replace("”", ""))

    return suggestions

KbkitproLayoutDetector

Bases: LayoutDetector

Layout detector for the KeyboardKit Pro demo keyboard. See LayoutDetector for more information.

Source code in kebbie/emulator.py
class KbkitproLayoutDetector(LayoutDetector):
    """Layout detector for the KeyboardKit Pro demo keyboard. See
    `LayoutDetector` for more information.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            xpath_root=".//XCUIElementTypeOther[XCUIElementTypeButton and XCUIElementTypeTextField]",
            xpath_keys=".//XCUIElementTypeButton",
            android=False,
            **kwargs,
        )

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        for data in self.driver.page_source.split("<XCUIElementTypeOther"):
            if "<XCUIElementTypeTextField" in data:
                pred_part = data.split("<XCUIElementTypeTextField")[0]
                if "<XCUIElementTypeButton" in pred_part and 'name="Add"' in pred_part:
                    for elem in pred_part.split(">")[2:]:
                        if "<XCUIElementTypeTextField" in elem:
                            break
                        m = re.search(r"name=\"([^\"]*)\"", elem)
                        if m:
                            name = m.group(1)
                            suggestions.append(name.replace("“", "").replace("”", ""))

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    for data in self.driver.page_source.split("<XCUIElementTypeOther"):
        if "<XCUIElementTypeTextField" in data:
            pred_part = data.split("<XCUIElementTypeTextField")[0]
            if "<XCUIElementTypeButton" in pred_part and 'name="Add"' in pred_part:
                for elem in pred_part.split(">")[2:]:
                    if "<XCUIElementTypeTextField" in elem:
                        break
                    m = re.search(r"name=\"([^\"]*)\"", elem)
                    if m:
                        name = m.group(1)
                        suggestions.append(name.replace("“", "").replace("”", ""))

    return suggestions

KbkitossLayoutDetector

Bases: LayoutDetector

Layout detector for the KeyboardKit OSS demo keyboard. See LayoutDetector for more information.

Source code in kebbie/emulator.py
class KbkitossLayoutDetector(LayoutDetector):
    """Layout detector for the KeyboardKit OSS demo keyboard. See
    `LayoutDetector` for more information.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            xpath_root=".//XCUIElementTypeOther[XCUIElementTypeButton and XCUIElementTypeStaticText]",
            xpath_keys=".//XCUIElementTypeButton",
            android=False,
            **kwargs,
        )

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        for data in self.driver.page_source.split("<XCUIElementTypeOther"):
            if ", Subtitle" in data:
                pred_part = data.split(", Subtitle")[0]
                for elem in pred_part.split(">")[1:]:
                    m = re.search(r"name=\"([^\"]*)\"?", elem)
                    if m:
                        name = m.group(1)
                        suggestions.append(name.replace("“", "").replace("”", ""))

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    for data in self.driver.page_source.split("<XCUIElementTypeOther"):
        if ", Subtitle" in data:
            pred_part = data.split(", Subtitle")[0]
            for elem in pred_part.split(">")[1:]:
                m = re.search(r"name=\"([^\"]*)\"?", elem)
                if m:
                    name = m.group(1)
                    suggestions.append(name.replace("“", "").replace("”", ""))

    return suggestions

SwiftkeyLayoutDetector

Bases: LayoutDetector

Layout detector for the Swiftkey keyboard. See LayoutDetector for more information.

Source code in kebbie/emulator.py
class SwiftkeyLayoutDetector(LayoutDetector):
    """Layout detector for the Swiftkey keyboard. See `LayoutDetector` for more
    information.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            xpath_root=f"./*/*[@package='{KEYBOARD_PACKAGE[SWIFTKEY]}']",
            xpath_keys=".//*[@class='android.view.View'][@content-desc]",
            **kwargs,
        )

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        # Get the raw content as text, weed out useless elements
        for data in self.driver.page_source.split("<android.widget.FrameLayout"):
            if "com.touchtype.swiftkey" in data and "<android.view.View " in data:
                sections = data.split("<android.view.View ")
                for section in sections[1:]:
                    m = re.search(r"content-desc=\"([^\"]*)\"", section)
                    if m:
                        suggestions.append(html.unescape(m.group(1)))
                break

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    # Get the raw content as text, weed out useless elements
    for data in self.driver.page_source.split("<android.widget.FrameLayout"):
        if "com.touchtype.swiftkey" in data and "<android.view.View " in data:
            sections = data.split("<android.view.View ")
            for section in sections[1:]:
                m = re.search(r"content-desc=\"([^\"]*)\"", section)
                if m:
                    suggestions.append(html.unescape(m.group(1)))
            break

    return suggestions

YandexLayoutDetector

Bases: LayoutDetector

Layout detector for the Yandex keyboard. See LayoutDetector for more information.

Source code in kebbie/emulator.py
class YandexLayoutDetector(LayoutDetector):
    """Layout detector for the Yandex keyboard. See `LayoutDetector` for more
    information.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            xpath_root=f"./*/*[@package='{KEYBOARD_PACKAGE[YANDEX]}']",
            xpath_keys=".//*[@class='ya.d'][@content-desc]",
            **kwargs,
        )

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        # Depending on whether we are on a real device or on an emulator, the
        # Yandex keyboard uses different XML tags...
        if "<javaClass" in self.driver.page_source:  # Real device
            section = self.driver.page_source.split(f"{KEYBOARD_PACKAGE[YANDEX]}:id/drawable_suggest_container")[
                1
            ].split("</android.view.View>")[0]

            for line in section.split("\n"):
                if "<javaClass" in line:
                    m = re.search(r"content-desc=\"([^\"]*)\"", line)
                    if m:
                        suggestions.append(html.unescape(m.group(1)))
        else:  # Emulator
            for s in self.driver.page_source.split("android.widget.LinearLayout"):
                if f"{KEYBOARD_PACKAGE[YANDEX]}:id/kb_suggest_suggestions_container" in s:
                    suggestions_section = s
                    break

            for line in suggestions_section.split("\n"):
                if (
                    "kb_suggest_left_suggestion" in line
                    or "kb_suggest_center_suggestion" in line
                    or "kb_suggest_right_suggestion" in line
                ):
                    m = re.search(r"content-desc=\"([^\"]*)\"", line)
                    if m:
                        suggestions.append(html.unescape(m.group(1)))

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    # Depending on whether we are on a real device or on an emulator, the
    # Yandex keyboard uses different XML tags...
    if "<javaClass" in self.driver.page_source:  # Real device
        section = self.driver.page_source.split(f"{KEYBOARD_PACKAGE[YANDEX]}:id/drawable_suggest_container")[
            1
        ].split("</android.view.View>")[0]

        for line in section.split("\n"):
            if "<javaClass" in line:
                m = re.search(r"content-desc=\"([^\"]*)\"", line)
                if m:
                    suggestions.append(html.unescape(m.group(1)))
    else:  # Emulator
        for s in self.driver.page_source.split("android.widget.LinearLayout"):
            if f"{KEYBOARD_PACKAGE[YANDEX]}:id/kb_suggest_suggestions_container" in s:
                suggestions_section = s
                break

        for line in suggestions_section.split("\n"):
            if (
                "kb_suggest_left_suggestion" in line
                or "kb_suggest_center_suggestion" in line
                or "kb_suggest_right_suggestion" in line
            ):
                m = re.search(r"content-desc=\"([^\"]*)\"", line)
                if m:
                    suggestions.append(html.unescape(m.group(1)))

    return suggestions

TappaLayoutDetector

Bases: LayoutDetector

Layout detector for the Tappa keyboard. See LayoutDetector for more information.

Source code in kebbie/emulator.py
class TappaLayoutDetector(LayoutDetector):
    """Layout detector for the Tappa keyboard. See `LayoutDetector` for more
    information.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(
            *args,
            xpath_root=f"./*/*[@package='{KEYBOARD_PACKAGE[TAPPA]}']",
            xpath_keys=".//com.mocha.keyboard.inputmethod.keyboard.Key",
            **kwargs,
        )

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        # Get the raw content as text, weed out useless elements
        section = self.driver.page_source.split(f"{KEYBOARD_PACKAGE[TAPPA]}:id/suggestions_strip")[1].split(
            "</android.widget.LinearLayout>"
        )[0]

        for line in section.split("\n"):
            if "<android.widget.TextView" in line:
                m = re.search(r"text=\"([^\"]*)\"", line)
                if m:
                    suggestions.append(html.unescape(m.group(1)))

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    # Get the raw content as text, weed out useless elements
    section = self.driver.page_source.split(f"{KEYBOARD_PACKAGE[TAPPA]}:id/suggestions_strip")[1].split(
        "</android.widget.LinearLayout>"
    )[0]

    for line in section.split("\n"):
        if "<android.widget.TextView" in line:
            m = re.search(r"text=\"([^\"]*)\"", line)
            if m:
                suggestions.append(html.unescape(m.group(1)))

    return suggestions

FleksyLayoutDetector

Bases: LayoutDetector

Layout detector for the Fleksy keyboard. See LayoutDetector for more information.

Note that this keyboard is only semi-automatically detected: the layout itself is not detected, but the suggestions are retrieved from the XML tree (no need to rely on slow OCR, so it is much faster). The layout is hard-coded for now.

Source code in kebbie/emulator.py
class FleksyLayoutDetector(LayoutDetector):
    """Layout detector for the Fleksy keyboard. See `LayoutDetector` for more
    information.

    Note that this keyboard is only semi-automatically detected: the layout
    itself is not detected, but the suggestions are retrieved from the XML tree
    (no need to rely on slow OCR, so it is much faster). The layout is hard-coded for now.
    """

    def __init__(self, driver: webdriver.Remote):
        self.driver = driver

        # Adapt the layout to the screen
        w = FLEKSY_LAYOUT["keyboard_frame"][2]
        h = FLEKSY_LAYOUT["keyboard_frame"][3]
        self.layout = {"keyboard_frame": FLEKSY_LAYOUT["keyboard_frame"]}
        for layout_name in ["lowercase", "uppercase", "numbers"]:
            for key_name, key_frame in FLEKSY_LAYOUT[layout_name].items():
                if layout_name not in self.layout:
                    self.layout[layout_name] = {}
                self.layout[layout_name][key_name] = [
                    int(key_frame[0] * w),
                    int(key_frame[1] * h),
                    int(key_frame[2] * w),
                    int(key_frame[3] * h),
                ]

    def get_suggestions(self) -> List[str]:
        """Method to retrieve the keyboard suggestions from the XML tree.

        Returns:
            List of suggestions from the keyboard.
        """
        suggestions = []

        # Get the raw content as text, weed out useless elements
        sections = [
            s
            for s in self.driver.page_source.split("XCUIElementTypeOther")
            if "XCUIElementTypeStaticText" in s and "XCUIElementTypeButton" not in s
        ]

        for s in sections:
            m = re.search(r"name=\"([^\"]*)\"", s)
            if m:
                suggestions.append(html.unescape(m.group(1)))

        return suggestions

get_suggestions()

Method to retrieve the keyboard suggestions from the XML tree.

Returns:

Type Description
List[str]

List of suggestions from the keyboard.

Source code in kebbie/emulator.py
def get_suggestions(self) -> List[str]:
    """Method to retrieve the keyboard suggestions from the XML tree.

    Returns:
        List of suggestions from the keyboard.
    """
    suggestions = []

    # Get the raw content as text, weed out useless elements
    sections = [
        s
        for s in self.driver.page_source.split("XCUIElementTypeOther")
        if "XCUIElementTypeStaticText" in s and "XCUIElementTypeButton" not in s
    ]

    for s in sections:
        m = re.search(r"name=\"([^\"]*)\"", s)
        if m:
            suggestions.append(html.unescape(m.group(1)))

    return suggestions

gesture.py

Module containing the function make_swipe_gesture, which is used to create a natural-looking swipe gesture from a list of letter-points.

make_swipe_gesture(control_points)

Function to generate an artificial swipe gesture from a list of points. The given points represent the typed letters on the keyboard. This function simply generates several other points between the control points. Points are generated using sequential Bezier curves. The resulting swipe gesture passes through the control points.

Parameters:

Name Type Description Default
control_points List[Tuple[float, float]]

Control points, representing the letters typed. The resulting swipe gesture will pass through these points.

required

Returns:

Type Description
List[Tuple[float, float]]

Points generated by the swipe gesture.

Source code in kebbie/gesture.py
def make_swipe_gesture(control_points: List[Tuple[float, float]]) -> List[Tuple[float, float]]:
    """Function to generate artificial swipe gesture from a list of points.
    The given points represents the typed letters on the keyboard. This
    function simply generate several other points between the control points.
    Points are generated using sequential Bezier curves. The resulting swipe
    gesture pass by the control points.

    Args:
        control_points (List[Tuple[float, float]]): Control points,
            representing the letters typed. The resulting swipe gesture will
            pass through these points.

    Returns:
        Points generated by the swipe gesture.
    """
    gesture_points = [control_points[0]]

    # Pick a "style" (speed & acceleration) and keep it constant across the gesture
    speed = random.uniform(MIN_N_POINTS_PER_DIST, MAX_N_POINTS_PER_DIST)
    acceleration = random.uniform(MIN_ACCELERATION, MAX_ACCELERATION)

    # Generate bezier curves between each control points
    for p1, p2 in zip(control_points[:-1], control_points[1:]):
        # The distance between the 2 points will dictate the speed and radius
        d = euclidian_dist(p1, p2)
        radius = min(d, MAX_RADIUS)
        n_points = max(1, int(d * speed))

        linspace = accelerated_linspace(n_points, acceleration)

        # We don't want the curves to be straight between the control points,
        # so we generate random points to add curves
        p1_curv = random_point_around(p1, radius=radius)
        p2_curv = random_point_around(p2, radius=radius)

        # Make the bezier curve with the specified number of points
        xs, ys = bezier_curve([p2, p2_curv, p1_curv, p1], linspace=linspace)
        bezier_points = list(zip(xs, ys))

        # Make sure the control point p2 is here
        if bezier_points[-1] != p2:
            bezier_points.append(p2)
        # p1 was already added in the previous loop, no need to add it
        if bezier_points[0] == p1:
            bezier_points = bezier_points[1:]

        gesture_points.extend(bezier_points)

    return gesture_points
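
For example (a sketch with made-up pixel coordinates for three key centers):

# Hypothetical centers of three keys on the keyboard
control_points = [(140.0, 250.0), (310.0, 90.0), (620.0, 90.0)]
gesture = make_swipe_gesture(control_points)
# The gesture contains many intermediate samples, passes through every control
# point, and uses a randomly chosen (but constant) speed and acceleration profile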

random_point_around(p, radius)

Generate a random point around the given point p, within the given radius.

Parameters:

Name Type Description Default
p Tuple[float, float]

Coordinates to use as a starting point.

required
radius float

Radius within the starting point to generate the random point.

required

Returns:

Type Description
Tuple[float, float]

Coordinates of the generated random point.

Source code in kebbie/gesture.py
def random_point_around(p: Tuple[float, float], radius: float) -> Tuple[float, float]:
    """Generate a random point around the given point p, within the given
    radius.

    Args:
        p (Tuple[float, float]): Coordinates to use as a starting point.
        radius (float): Radius within the starting point to generate the random
            point.

    Returns:
        Coordinates of the generated random point.
    """
    rand_x = random.uniform(p[0] - radius, p[0] + radius)
    rand_y = random.uniform(p[1] - radius, p[1] + radius)
    return (rand_x, rand_y)
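
Note that, despite the parameter name, the point is sampled uniformly within the axis-aligned square of side 2 * radius centred on p, not within a disc:

p_curv = random_point_around((100.0, 200.0), radius=15.0)
# p_curv[0] lies in [85.0, 115.0] and p_curv[1] lies in [185.0, 215.0]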

bernstein_poly(i, n, t)

The Bernstein polynomial of n, i as a function of t.

Taken from : https://stackoverflow.com/a/12644499/9494790

Parameters:

Name Type Description Default
i int

i

required
n int

n

required
t float

t

required

Returns:

Type Description
float

The computed value for this polynomial function.

Source code in kebbie/gesture.py
def bernstein_poly(i: int, n: int, t: float) -> float:
    """The Bernstein polynomial of n, i as a function of t.

    Taken from : https://stackoverflow.com/a/12644499/9494790

    Args:
        i (int): i
        n (int): n
        t (float): t

    Returns:
        The computed value for this polynomial function.
    """
    return comb(n, i) * (t ** (n - i)) * (1 - t) ** i
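
These are the Bernstein basis polynomials of degree n (evaluated with t and 1 - t swapped relative to the usual convention), so for any t they sum to 1, which gives a quick sanity check:

n, t = 3, 0.42
assert abs(sum(bernstein_poly(i, n, t) for i in range(n + 1)) - 1.0) < 1e-9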

bezier_curve(control_points, linspace)

Given a set of control points, return the bezier curve defined by the control points.

See : http://processingjs.nihongoresources.com/bezierinfo/

Taken from : https://stackoverflow.com/a/12644499/9494790

Parameters:

Name Type Description Default
control_points List[Tuple[float, float]]

Control points used to generate the bezier curve.

required
linspace List[float]

Linspace to use for sampling points across the Bezier curve.

required

Returns:

Type Description
Tuple[List[float], List[float]]

Sampled points along the bezier curve.

Source code in kebbie/gesture.py
def bezier_curve(control_points: List[Tuple[float, float]], linspace: List[float]) -> Tuple[List[float], List[float]]:
    """Given a set of control points, return the bezier curve defined by the
    control points.

    See : http://processingjs.nihongoresources.com/bezierinfo/

    Taken from : https://stackoverflow.com/a/12644499/9494790

    Args:
        control_points (List[Tuple[float, float]]): Control points used to
            generate the bezier curve.
        linspace (List[float]): Linspace to use for sampling points across the
            Bezier curve.

    Returns:
        Sampled points along the bezier curve.
    """
    n_points = len(control_points)
    x_points = np.array([p[0] for p in control_points])
    y_points = np.array([p[1] for p in control_points])

    polynomial_array = np.array([bernstein_poly(i, n_points - 1, linspace) for i in range(0, n_points)])

    x_vals = np.dot(x_points, polynomial_array)
    y_vals = np.dot(y_points, polynomial_array)

    return x_vals, y_vals
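
For example, sampling five points along a quadratic curve. Note that with this parameterisation the curve starts (at t = 0) on the last control point and ends (at t = 1) on the first, which is why make_swipe_gesture passes its control points from p2 to p1:

import numpy as np

control = [(0.0, 0.0), (0.5, 1.0), (1.0, 0.0)]
xs, ys = bezier_curve(control, linspace=np.linspace(0.0, 1.0, 5))
# (xs[0], ys[0]) == (1.0, 0.0), the last control point
# (xs[-1], ys[-1]) == (0.0, 0.0), the first control point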

accelerated_linspace(n, acceleration)

Alternative to np.linspace: instead of returning a range of evenly distributed numbers, this one is not evenly distributed, and simulates an acceleration at first, then a deceleration.

Parameters:

Name Type Description Default
n int

Number of points to generate in the linspace.

required
acceleration float

A number that dictates how constant the acceleration is. The lower the value, the more pronounced the S-shape.

required

Returns:

Type Description
List[float]

Generated points.

Source code in kebbie/gesture.py
def accelerated_linspace(n: int, acceleration: float) -> List[float]:
    """Alternative to np.linspace, instead of giving a range of number evenly
    distributed, this one is not evenly distributed, and simulate an
    acceleration at first, and then a deceleration.

    Args:
        n (int): Number of points to generate in the linspace.
        acceleration (float): A number that dictates how constant the
            acceleration is. The lower the value, the more pronounced the S-shape.

    Returns:
        Generated points.
    """

    def norm(x):
        nom = x - x.min()
        denom = x.max() - x.min()
        return nom / denom

    def sigmoid(x, k):
        return 1 / (1 + np.exp(-x / k))

    linspace = np.linspace(-1.0, 1.0, n)

    if n <= 1:
        return linspace
    else:
        return norm(sigmoid(linspace, k=acceleration))
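
For example:

ts = accelerated_linspace(9, acceleration=0.3)
# ts runs from 0.0 to 1.0; consecutive values are close together near both ends
# (slow start, slow stop) and further apart in the middle (fast mid-gesture)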

layout.py

Module containing the helper LayoutHelper, a useful class to deal with the layout of a keyboard, access key positions, etc...

KeyInfo dataclass

Structure containing all information needed for a given character (key).

Parameters:

Name Type Description Default
klayer_id int

Keyboard Layer ID where this key is located.

required
width float

Width of the key.

required
height float

Height of the key.

required
center Tuple[float, float]

Center position (x, y coordinates) of the key.

required
Source code in kebbie/layout.py
@dataclass
class KeyInfo:
    """Structure containing all information needed for a given character (key).

    Args:
        klayer_id (int): Keyboard Layer ID where this key is located.
        width (float): Width of the key.
        height (float): Height of the key.
        center (Tuple[float, float]): Center position (x, y coordinates) of the
            key.
    """

    klayer_id: int
    width: float
    height: float
    center: Tuple[float, float]

Key dataclass

Structure containing information needed for each key of a given keyboard layer.

Parameters:

Name Type Description Default
char str

Character associated with this key.

required
bounds Dict[str, float]

Dictionary representing the bounding box of the key. The dictionary should contain the following keys: right, left, top, bottom.

required
Source code in kebbie/layout.py
@dataclass
class Key:
    """Structure containing information needed for each key of a given keyboard
    layer.

    Args:
        char (str): Character associated with this key.
        bounds (Dict[str, float]): Dictionary representing the bounding box of
            the key. The dictionary should contain the following keys:
            `right`, `left`, `top`, `bottom`.
    """

    char: str
    bounds: Dict[str, float]
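
For illustration, the two structures for a single key might look like this (made-up pixel values):

# Hypothetical geometry of the "q" key on the first keyboard layer
q_key = Key(char="q", bounds={"left": 0.0, "top": 0.0, "right": 108.0, "bottom": 180.0})
q_info = KeyInfo(klayer_id=0, width=108.0, height=180.0, center=(54.0, 90.0))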

LayoutHelper

Small class that represents a keyboard layout. The goal of this class is to offer some easy-to-use methods to deal with a keyboard layout.

Parameters:

Name Type Description Default
lang str

Language of the layout to load.

'en-US'
custom_keyboard Dict

If provided, instead of relying on the keyboard layout provided by default, uses the given keyboard layout.

None
ignore_layers_after Optional[int]

Ignore higher layers of the keyboard layout. If None is given, no layer is ignored.

None
Source code in kebbie/layout.py
class LayoutHelper:
    """Small class that represents a Keyboard layout. The goal of this class is
    to offer some easy-to-use method to deal with a keyboard layout.

    Args:
        lang (str, optional): Language of the layout to load.
        custom_keyboard (Dict, optional): If provided, instead of relying on
            the keyboard layout provided by default, uses the given keyboard
            layout.
        ignore_layers_after (Optional[int]) : Ignore higher layers of the
            keyboard layout. If `None` is given, no layer is ignored.
    """

    def __init__(self, lang: str = "en-US", custom_keyboard: Dict = None, ignore_layers_after: Optional[int] = None):
        keyboard = custom_keyboard if custom_keyboard is not None else load_keyboard(lang)
        self.keys_info, self.klayers_info, self.accents = self._extract_infos(keyboard["layout"], ignore_layers_after)
        self.letter_accents = [c for c in self.accents if re.match(r"^[\pL]+$", c)]
        self.spelling_symbols = keyboard["settings"]["allowed_symbols_in_words"]
        self.layout_name = keyboard["keyboard"]["default-layout"]

    def _extract_infos(  # noqa: C901
        self, keyboard_layout: Dict, ignore_layers_after: Optional[int] = None
    ) -> Tuple[Dict[str, KeyInfo], Dict[int, Key], List[str]]:
        """This method reads the given keyboard layout, and extract useful data
        structures from this (to be used later by other methods). This
        basically builds the LayoutHelper class (and should be used only inside
        the constructor).

        Note:
            The given keyboard layout contains 24 layers. Each key appears in
            one (or several) layers of the keyboard. Accents are associated with
            the same key as their non-accented version.
            This class may be used to generate typing noise, so accents should
            have their own keys (and closer accents should be represented by
            closer keys). This method takes care of it, by generating "virtual
            keyboard layers", for each group of accents. The goal is to
            generate a virtual keyboard layer that is as close as possible to
            the actual keyboard used by real users.

        Args:
            keyboard_layout (Dict): Dictionary representing the keyboard and
                its layout.
            ignore_layers_after (Optional[int]) : Ignore higher layers of the
                keyboard layout. If `None` is given, no layer is ignored.

        Returns:
            Key information for each character in the keyboard.
            Key information for each layer of the keyboard.
            List of accents used in the keyboard.
        """
        keys_info = {}  # Dict char -> key infos (bounds, center, klayer ID)
        klayers_info = defaultdict(list)  # Dict klayer ID -> list of keys (bounds, char)
        all_accents = set()

        # A keyboard layout is made of several "layers", each identified by a KeyboardID
        last_klayer_id = len(keyboard_layout)
        for klayer in keyboard_layout:
            if klayer["buttons"] is None or (ignore_layers_after is not None and klayer["id"] > ignore_layers_after):
                continue

            # Each layer is a list of buttons
            for button in klayer["buttons"]:
                # Each button always has a character, and optionally accents
                char, accents = button["labels"][0], button["labels"][1:]

                # Special characters : space, shift, numbers, magic, etc...
                if button["type"] != 1:
                    if char.lower() == SPACE:
                        char = " "
                    elif char == POINT:
                        # Points should be added to our key infos
                        pass
                    else:
                        # Other special characters are ignored
                        char = None

                if char is None:
                    continue

                # Save the character and its key information
                # Save it only if it's not already in a previous klayer
                if char not in keys_info or keys_info[char].klayer_id > klayer["id"]:
                    keys_info[char] = KeyInfo(
                        klayer["id"],
                        button["boundingRect"]["right"] - button["boundingRect"]["left"],
                        button["boundingRect"]["bottom"] - button["boundingRect"]["top"],
                        (button["centerPoint"]["x"], button["centerPoint"]["y"]),
                    )
                # But always save its info in the klayers info
                klayers_info[klayer["id"]].append(Key(char, button["boundingRect"]))

                # Then, save the accents if any
                for i, char_accent in enumerate(accents):
                    all_accents.add(char_accent)

                    # Create a virtual position for the accent
                    bounds, center = self._make_virtual_key(i, button["boundingRect"])

                    # Save the accent (only if not existing) in a new virtual klayer
                    if char_accent not in keys_info:
                        keys_info[char_accent] = KeyInfo(
                            last_klayer_id,
                            bounds["right"] - bounds["left"],
                            bounds["bottom"] - bounds["top"],
                            (center["x"], center["y"]),
                        )
                    # But always save its info in the klayers info
                    klayers_info[last_klayer_id].append(Key(char_accent, bounds))

                # If we added some accent in a virtual klayer, don't forget to update the last klayer ID
                if accents:
                    last_klayer_id += 1

        return keys_info, klayers_info, sorted(all_accents)

    def _make_virtual_key(
        self, idx: int, initial_bounds: Dict[str, float]
    ) -> Tuple[Dict[str, float], Dict[str, float]]:
        """Method to create a new boundary for an accented character. Based on
        the given id, the generated boundary box will be generated at a
        different position.

        This method tries to follow a similar pattern as the sample app, with
        accents appearing in lines of 4 accents.

        Args:
            idx (int): The index of the bounding box to generate.
            initial_bounds (Dict[str, float]): The bounding box of the
                non-accented key.

        Returns:
            Generated bounding box.
            Its associated center position.
        """
        width = initial_bounds["right"] - initial_bounds["left"]
        height = initial_bounds["bottom"] - initial_bounds["top"]

        start_x = initial_bounds["left"] + (idx % N_ACCENT_PER_LINE) * width
        start_y = initial_bounds["bottom"] - (idx // N_ACCENT_PER_LINE) * height

        bounds = {
            "bottom": start_y,
            "left": start_x,
            "right": start_x + width,
            "top": start_y - height,
        }
        center = {
            "x": bounds["left"] + width / 2,
            "y": bounds["top"] + height / 2,
        }
        return bounds, center

    def get_key_info(self, char: str) -> Tuple[float, float, float, float, int]:
        """Method to retrieve the information associated to a specific key.

        Args:
            char (str): Character for which to retrieve key information.

        Raises:
            KeyError: Exception raised if the given character can't be typed (
                because it doesn't exist on this keyboard layout).

        Returns:
            Width of the key for the requested character.
            Height of the key for the requested character.
            Center position (x-axis) of the key for the requested character.
            Center position (y-axis) of the key for the requested character.
            Keyboard layer ID where the character's key is located.
        """
        k = self.keys_info[char]
        return k.width, k.height, k.center[0], k.center[1], k.klayer_id

    def get_key(self, pos: Tuple[float, float], klayer_id: int) -> str:
        """Get the character associated with the given position.

        Args:
            pos (Tuple[float, float]): Position (x, y) in the keyboard.
            klayer_id (int): Keyboard layer ID to use.

        Returns:
            Character associated to the given position.
        """
        klayer = self.klayers_info[klayer_id]

        try:
            # Retrieve the key that contains the sampled position
            key = next(
                k
                for k in klayer
                if k.bounds["left"] <= pos[0] <= k.bounds["right"] and k.bounds["top"] <= pos[1] <= k.bounds["bottom"]
            )
        except StopIteration:
            # Maybe the sampled position was out of bound -> retrieve the closest key
            key = min(
                klayer,
                key=lambda k: euclidian_dist(
                    pos,
                    (
                        k.bounds["left"] + (k.bounds["right"] - k.bounds["left"]) / 2,
                        k.bounds["top"] + (k.bounds["bottom"] - k.bounds["top"]) / 2,
                    ),
                ),
            )

        return key.char
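
A short usage sketch (assuming the default en-US layout file is available):

layout = LayoutHelper(lang="en-US")
width, height, x, y, klayer_id = layout.get_key_info("a")  # geometry of the "a" key
assert layout.get_key((x, y), klayer_id) == "a"  # the key center maps back to "a"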

_extract_infos(keyboard_layout, ignore_layers_after=None)

This method reads the given keyboard layout, and extracts useful data structures from it (to be used later by other methods). This basically builds the LayoutHelper class (and should be used only inside the constructor).

Note

The given keyboard layout contains 24 layers. Each key appears in one (or several) layers of the keyboard. Accents are associated with the same key as their non-accented version. This class may be used to generate typing noise, so accents should have their own keys (and closer accents should be represented by closer keys). This method takes care of it by generating "virtual keyboard layers" for each group of accents. The goal is to generate a virtual keyboard layer that is as close as possible to the actual keyboard used by real users.

Parameters:

Name Type Description Default
keyboard_layout Dict

Dictionary representing the keyboard and its layout.

required
ignore_layers_after Optional[int]

Ignore higher layers of the keyboard layout. If None is given, no layer is ignored.

None

Returns:

Type Description
Dict[str, KeyInfo]

Key information for each character in the keyboard.

Dict[int, Key]

Key information for each layer of the keyboard.

List[str]

List of accents used in the keyboard.

Source code in kebbie/layout.py
def _extract_infos(  # noqa: C901
    self, keyboard_layout: Dict, ignore_layers_after: Optional[int] = None
) -> Tuple[Dict[str, KeyInfo], Dict[int, Key], List[str]]:
    """This method reads the given keyboard layout, and extract useful data
    structures from this (to be used later by other methods). This
    basically builds the LayoutHelper class (and should be used only inside
    the constructor).

    Note:
        The given keyboard layout contains 24 layers. Each key appears in
        one (or several) layers of the keyboard. Accents are associated with
        the same key as their non-accented version.
        This class may be used to generate typing noise, so accents should
        have their own keys (and closer accents should be represented by
        closer keys). This method takes care of it, by generating "virtual
        keyboard layers", for each group of accents. The goal is to
        generate a virtual keyboard layer that is as close as possible as
        the actual keyboard, used by real-users.

    Args:
        keyboard_layout (Dict): Dictionary representing the keyboard and
            its layout.
        ignore_layers_after (Optional[int]): Ignore higher layers of the
            keyboard layout. If `None` is given, no layer is ignored.

    Returns:
        Key information for each character in the keyboard.
        Key information for each layer of the keyboard.
        List of accents used in the keyboard.
    """
    keys_info = {}  # Dict char -> key infos (bounds, center, klayer ID)
    klayers_info = defaultdict(list)  # Dict klayer ID -> list of keys (bounds, char)
    all_accents = set()

    # A keyboard layout is made of several "layers", each identified by a KeyboardID
    last_klayer_id = len(keyboard_layout)
    for klayer in keyboard_layout:
        if klayer["buttons"] is None or (ignore_layers_after is not None and klayer["id"] > ignore_layers_after):
            continue

        # Each layer is a list of button
        for button in klayer["buttons"]:
            # Button always have a character, and optionally accents
            char, accents = button["labels"][0], button["labels"][1:]

            # Special characters : space, shift, numbers, magic, etc...
            if button["type"] != 1:
                if char.lower() == SPACE:
                    char = " "
                elif char == POINT:
                    # Points should be added to our key infos
                    pass
                else:
                    # Other special characters are ignored
                    char = None

            if char is None:
                continue

            # Save the character and its key information
            # Save it only if it's not already in a previous klayer
            if char not in keys_info or keys_info[char].klayer_id > klayer["id"]:
                keys_info[char] = KeyInfo(
                    klayer["id"],
                    button["boundingRect"]["right"] - button["boundingRect"]["left"],
                    button["boundingRect"]["bottom"] - button["boundingRect"]["top"],
                    (button["centerPoint"]["x"], button["centerPoint"]["y"]),
                )
            # But always save its info in the klayers info
            klayers_info[klayer["id"]].append(Key(char, button["boundingRect"]))

            # Then, save the accents if any
            for i, char_accent in enumerate(accents):
                all_accents.add(char_accent)

                # Create a virtual position for the accent
                bounds, center = self._make_virtual_key(i, button["boundingRect"])

                # Save the accent (only if not existing) in a new virtual klayer
                if char_accent not in keys_info:
                    keys_info[char_accent] = KeyInfo(
                        last_klayer_id,
                        bounds["right"] - bounds["left"],
                        bounds["bottom"] - bounds["top"],
                        (center["x"], center["y"]),
                    )
                # But always save its info in the klayers info
                klayers_info[last_klayer_id].append(Key(char_accent, bounds))

            # If we added some accent in a virtual klayer, don't forget to update the last klayer ID
            if accents:
                last_klayer_id += 1

    return keys_info, klayers_info, sorted(all_accents)
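
For reference, here is a minimal sketch of the layout structure this method expects, inferred from the fields accessed above; the field names mirror the code, while the values are invented for illustration:

# Hypothetical minimal keyboard layout: a list of layers, each with an "id" and
# a list of "buttons". Only the fields read by _extract_infos are shown.
toy_keyboard_layout = [
    {
        "id": 0,
        "buttons": [
            {
                "type": 1,                       # 1 = regular character key
                "labels": ["e", "é", "è", "ê"],  # main character first, accents after
                "boundingRect": {"left": 0.0, "right": 0.1, "top": 0.0, "bottom": 0.1},
                "centerPoint": {"x": 0.05, "y": 0.05},
            },
        ],
    },
]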

_make_virtual_key(idx, initial_bounds)

Method to create a new bounding box for an accented character. Based on the given index, the bounding box is generated at a different position.

This method tries to follow a pattern similar to the sample app, with accents appearing in rows of 4.

Parameters:

Name Type Description Default
idx int

The index of the bounding box to generate.

required
initial_bounds Dict[str, float]

The bounding box of the non-accented key.

required

Returns:

Type Description
Dict[str, float]

Generated bounding box.

Dict[str, float]

Its associated center position.

Source code in kebbie/layout.py
def _make_virtual_key(
    self, idx: int, initial_bounds: Dict[str, float]
) -> Tuple[Dict[str, float], Dict[str, float]]:
    """Method to create a new boundary for an accented character. Based on
    the given id, the generated boundary box will be generated at a
    different position.

    This method tries to follow a similar pattern as the sample app, with
    accents appearing in lines of 4 accents.

    Args:
        idx (int): The index of the bounding box to generate.
        initial_bounds (Dict[str, float]): The bounding box of the
            non-accented key.

    Returns:
        Generated bounding box.
        Its associated center position.
    """
    width = initial_bounds["right"] - initial_bounds["left"]
    height = initial_bounds["bottom"] - initial_bounds["top"]

    start_x = initial_bounds["left"] + (idx % N_ACCENT_PER_LINE) * width
    start_y = initial_bounds["bottom"] - (idx // N_ACCENT_PER_LINE) * height

    bounds = {
        "bottom": start_y,
        "left": start_x,
        "right": start_x + width,
        "top": start_y - height,
    }
    center = {
        "x": bounds["left"] + width / 2,
        "y": bounds["top"] + height / 2,
    }
    return bounds, center
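
To visualise how the virtual accent keys are laid out, here is a standalone re-implementation of the arithmetic above (assuming N_ACCENT_PER_LINE is 4, as suggested by the "rows of 4" note; the real constant is defined in kebbie and may differ):

N_ACCENT_PER_LINE = 4  # assumed value, matching the "rows of 4" note above

def make_virtual_key(idx, initial_bounds):
    # Same arithmetic as LayoutHelper._make_virtual_key
    width = initial_bounds["right"] - initial_bounds["left"]
    height = initial_bounds["bottom"] - initial_bounds["top"]
    start_x = initial_bounds["left"] + (idx % N_ACCENT_PER_LINE) * width
    start_y = initial_bounds["bottom"] - (idx // N_ACCENT_PER_LINE) * height
    bounds = {"bottom": start_y, "left": start_x, "right": start_x + width, "top": start_y - height}
    center = {"x": bounds["left"] + width / 2, "y": bounds["top"] + height / 2}
    return bounds, center

key_bounds = {"left": 0.0, "right": 0.1, "top": 0.0, "bottom": 0.1}
for idx in range(5):
    _, center = make_virtual_key(idx, key_bounds)
    print(idx, center)  # accents 0-3 fill one row, accent 4 starts a new row above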

get_key_info(char)

Method to retrieve the information associated with a specific key.

Parameters:

Name Type Description Default
char str

Character for which to retrieve key information.

required

Raises:

Type Description
KeyError

Exception raised if the given character can't be typed (because it doesn't exist on this keyboard layout).

Returns:

Type Description
float

Width of the key for the requested character.

float

Height of the key for the requested character.

float

Center position (x-axis) of the key for the requested character.

float

Center position (y-axis) of the key for the requested character.

int

Keyboard layer ID where the character's key is located.

Source code in kebbie/layout.py
def get_key_info(self, char: str) -> Tuple[float, float, float, float, int]:
    """Method to retrieve the information associated to a specific key.

    Args:
        char (str): Character for which to retrieve key information.

    Raises:
        KeyError: Exception raised if the given character can't be typed (
            because it doesn't exist on this keyboard layout).

    Returns:
        Width of the key for the requested character.
        Height of the key for the requested character.
        Center position (x-axis) of the key for the requested character.
        Center position (y-axis) of the key for the requested character.
        Keyboard layer ID where the character's key is located.
    """
    k = self.keys_info[char]
    return k.width, k.height, k.center[0], k.center[1], k.klayer_id
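
A small usage sketch (the language code and the exact construction are assumptions; see the NoiseModel class below for a real call):

from kebbie.layout import LayoutHelper

layout = LayoutHelper("en-US")  # assumed language code
width, height, x_center, y_center, klayer_id = layout.get_key_info("a")
print(klayer_id)  # 0 for keys located on the default (lowercase) layer

try:
    layout.get_key_info("€")  # characters missing from the layout raise KeyError
except KeyError:
    print("'€' cannot be typed on this layout")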

get_key(pos, klayer_id)

Get the character associated with the given position.

Parameters:

Name Type Description Default
pos Tuple[float, float]

Position (x, y) in the keyboard.

required
klayer_id int

Keyboard layer ID to use.

required

Returns:

Type Description
str

Character associated with the given position.

Source code in kebbie/layout.py
def get_key(self, pos: Tuple[float, float], klayer_id: int) -> str:
    """Get the character associated with the given position.

    Args:
        pos (Tuple[float, float]): Position (x, y) in the keyboard.
        klayer_id (int): Keyboard layer ID to use.

    Returns:
        Character associated to the given position.
    """
    klayer = self.klayers_info[klayer_id]

    try:
        # Retrieve the key that contains the sampled position
        key = next(
            k
            for k in klayer
            if k.bounds["left"] <= pos[0] <= k.bounds["right"] and k.bounds["top"] <= pos[1] <= k.bounds["bottom"]
        )
    except StopIteration:
        # Maybe the sampled position was out of bound -> retrieve the closest key
        key = min(
            klayer,
            key=lambda k: euclidian_dist(
                pos,
                (
                    k.bounds["left"] + (k.bounds["right"] - k.bounds["left"]) / 2,
                    k.bounds["top"] + (k.bounds["bottom"] - k.bounds["top"]) / 2,
                ),
            ),
        )

    return key.char
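
The lookup logic above (containment test first, then the nearest key center as a fallback for out-of-bounds taps) can be illustrated in isolation with a couple of toy keys; the Key namedtuple below is a stand-in for kebbie's own Key type:

import math
from collections import namedtuple

Key = namedtuple("Key", ["char", "bounds"])  # hypothetical stand-in for kebbie's Key

keys = [
    Key("a", {"left": 0.0, "right": 0.1, "top": 0.0, "bottom": 0.1}),
    Key("s", {"left": 0.1, "right": 0.2, "top": 0.0, "bottom": 0.1}),
]

def center(k):
    return (
        k.bounds["left"] + (k.bounds["right"] - k.bounds["left"]) / 2,
        k.bounds["top"] + (k.bounds["bottom"] - k.bounds["top"]) / 2,
    )

def get_key(pos, keys):
    # First, look for a key whose bounding box contains the position
    for k in keys:
        if k.bounds["left"] <= pos[0] <= k.bounds["right"] and k.bounds["top"] <= pos[1] <= k.bounds["bottom"]:
            return k.char
    # Otherwise, fall back to the key with the closest center
    return min(keys, key=lambda k: math.dist(pos, center(k))).char

print(get_key((0.05, 0.05), keys))  # "a": the position is inside the key
print(get_key((0.35, 0.05), keys))  # "s": out of bounds, closest center wins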

noise_model.py

Module defining the NoiseModel class, which takes care of introducing typos into a clean text (so we can later check whether the model properly corrects these typos).

Typo

Bases: Enum

Enum listing all possible typos that can be introduced.

Source code in kebbie/noise_model.py
class Typo(Enum):
    """Enum listing all possible typos that can be introduced."""

    # Deletions
    DELETE_SPELLING_SYMBOL = "DELETE_SPELLING_SYMBOL"
    DELETE_SPACE = "DELETE_SPACE"
    DELETE_PUNCTUATION = "DELETE_PUNCTUATION"
    DELETE_CHAR = "DELETE_CHAR"

    # Additions
    ADD_SPELLING_SYMBOL = "ADD_SPELLING_SYMBOL"
    ADD_SPACE = "ADD_SPACE"
    ADD_PUNCTUATION = "ADD_PUNCTUATION"
    ADD_CHAR = "ADD_CHAR"

    # Substitutions
    SUBSTITUTE_CHAR = "SUBSTITUTE_CHAR"

    # Simplifications
    SIMPLIFY_ACCENT = "SIMPLIFY_ACCENT"
    SIMPLIFY_CASE = "SIMPLIFY_CASE"

    # Transposition
    TRANSPOSE_CHAR = "TRANSPOSE_CHAR"

    # Common typos
    COMMON_TYPO = "COMMON_TYPO"

NoiseModel

Class responsible for introducing typos into a clean text.

Most typos are introduced directly on the text. Fuzzy typing is then applied, using two Gaussian distributions (one for the x-axis, one for the y-axis) to mimic a user typing on a soft keyboard.

The ratio arguments control how wide the Gaussian distribution is: a wider distribution is less precise, a narrower one is more precise. To see what precision a given ratio corresponds to, run the following code:

from scipy.stats import norm

def compute(x):
    cdf = norm.cdf(x)
    return cdf - (1 - cdf)

print(compute(2.32))    # >>> 0.9796591226625606
So in this case, a ratio of 2.32 gives a precision of ~98% (a typo will be introduced in 2% of the cases).
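
Conversely, if you want to pick a ratio for a target precision, the relationship can be inverted with the Normal inverse CDF (a small sketch, not part of kebbie itself):

from scipy.stats import norm

def ratio_for_precision(precision):
    # compute(ratio) = 2 * cdf(ratio) - 1 = precision  =>  ratio = ppf((1 + precision) / 2)
    return norm.ppf((1 + precision) / 2)

print(ratio_for_precision(0.98))  # ~2.326, i.e. a typo on roughly 2% of keystrokes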

Parameters:

Name Type Description Default
lang str

Language used.

required
custom_keyboard Dict

If provided, instead of relying on the keyboard layout provided by default, uses the given keyboard layout.

None
common_typos Optional[Dict[str, List[str]]]

Dictionary of common typos. If None, common typos are not used.

None
typo_probs Optional[Dict[str, float]]

Probabilities for each type of typos. If None is given, DEFAULT_TYPO_PROBS is used.

None
x_offset float

Parameter for the Gaussian distribution for the fuzzy typing. Base position offset on the x-axis.

0
y_offset float

Parameter for the Gaussian distribution for the fuzzy typing. Base position offset on the y-axis.

0
x_ratio float

Parameter for the Gaussian distribution for the fuzzy typing. It controls how wide the distribution is on the x-axis, which is the precision of the typing.

DEFAULT_SIGMA_RATIO
y_ratio float

Parameter for the Gaussian distribution for the fuzzy typing. It controls how wide the distribution is on the y-axis, which is the precision of the typing.

DEFAULT_SIGMA_RATIO
Source code in kebbie/noise_model.py
class NoiseModel:
    """Class responsible for introducing typo in a clean text.

    Most of typos are introduced on text directly. Then fuzzy typing is
    applied, using two Gaussian distributions (for x-axis and y-axis),
    mimicking a user typing on a soft keyboard.

    The ratio arguments are here to choose how wide the Gaussian distribution
    is. A wider distribution will be less precise, a narrower distribution will
    be more precise. To test how wide a ratio is, run the following code :
    ```
    from scipy.stats import norm

    def compute(x):
        cdf = norm.cdf(x)
        return cdf - (1 - cdf)

    print(compute(2.32))    # >>> 0.9796591226625606
    ```
    So in this case, a ratio of `2.32` gives a precision of ~98% (a typo will
    be introduced in 2% of the cases).

    Args:
        lang (str): Language used.
        custom_keyboard (Dict, optional): If provided, instead of relying on
            the keyboard layout provided by default, uses the given keyboard
            layout.
        common_typos (Optional[Dict[str, List[str]]], optional): Dictionary of
            common typos. If `None`, common typos are not used.
        typo_probs (Optional[Dict[str, float]], optional): Probabilities for
            each type of typos. If `None` is given, `DEFAULT_TYPO_PROBS` is
            used.
        x_offset (float, optional): Parameter for the Gaussian distribution for
            the fuzzy typing. Base position offset on the x-axis.
        y_offset (float, optional): Parameter for the Gaussian distribution for
            the fuzzy typing. Base position offset on the y-axis.
        x_ratio (float, optional): Parameter for the Gaussian distribution for
            the fuzzy typing. It controls how wide the distribution is on the
            x-axis, which is the precision of the typing.
        y_ratio (float, optional): Parameter for the Gaussian distribution for
            the fuzzy typing. It controls how wide the distribution is on the
            y-axis, which is the precision of the typing.
    """

    def __init__(
        self,
        lang: str,
        custom_keyboard: Dict = None,
        common_typos: Optional[Dict[str, List[str]]] = None,
        typo_probs: Optional[Dict[str, float]] = None,
        x_offset: float = 0,
        y_offset: float = 0,
        x_ratio: float = DEFAULT_SIGMA_RATIO,
        y_ratio: float = DEFAULT_SIGMA_RATIO,
    ):
        self.lang = lang
        self.x_offset, self.y_offset = x_offset, y_offset
        self.x_ratio, self.y_ratio = x_ratio, y_ratio
        self.klayout = LayoutHelper(self.lang, custom_keyboard=custom_keyboard, ignore_layers_after=3)
        self.probs = typo_probs if typo_probs is not None else DEFAULT_TYPO_PROBS
        self.common_typos = common_typos if common_typos is not None else self._get_common_typos()

    def type_till_space(
        self,
        words: List[str],
    ) -> Tuple[
        List[Optional[Tuple[float, float]]],
        str,
        int,
        List[Typo],
    ]:
        """Method introducing typos word by word.

        This method receives a list of words, and type these words while
        introducing typos.
        So most of the time, only one word will be typed and the method will
        return. In some cases, the space is mistyped or deleted, so two words
        are typed.

        Args:
            words (List[str]): List of words to type.

        Returns:
            List of keystrokes (may contains some None).
            The typed characters as string.
            The number of words typed.
            The list of typos introduced in the string typed.
        """
        all_keystrokes = []
        all_typed_char = ""
        all_typos = []

        for i, word in enumerate(words):
            # Some words can't be corrected (numbers, symbols, etc...) -> Don't introduce typos
            error_free = False if self._is_correctable(word) else True

            # Add typos in the word
            noisy_word, typos = self._introduce_typos(word, error_free=error_free)
            all_typos += typos

            # Type the word (fuzzy)
            keystrokes, typed_char, typos = self._fuzzy_type(noisy_word, error_free=error_free)
            all_keystrokes += keystrokes
            all_typed_char += typed_char
            all_typos += typos

            # Then, we try to type a space (separator between words)
            # TODO : Modify this part for languages without space
            noisy_space, sp_typo_1 = self._introduce_typos(SPACE)
            keystrokes, typed_char, sp_typo_2 = self._fuzzy_type(noisy_space)

            # If the space is correctly typed, return now, otherwise type the next word
            if not sp_typo_1 and not sp_typo_2:
                break
            else:
                all_keystrokes += keystrokes
                all_typed_char += typed_char
                all_typos += sp_typo_1 + sp_typo_2

        return all_keystrokes, all_typed_char, i + 1, all_typos

    def swipe(self, word: str) -> Optional[List[Tuple[float, float]]]:
        """Method for creating an artificial swipe gesture given a word.

        Args:
            word (str): Word to type with a swipe gesture.

        Returns:
            Positions (x, y) of the generated swipe gesture, or None if the
                swipe gesture couldn't be created.
        """
        # Some words can't be corrected (numbers, symbols, etc...) -> Don't introduce typos
        error_free = False if self._is_correctable(word) else True

        # Get the core keystrokes (fuzzy)
        keystrokes, *_ = self._fuzzy_type(word, error_free=error_free)

        # If we can swipe that word, create the corresponding artificial gesture
        if all(keystrokes) and len(keystrokes) > 1:
            return make_swipe_gesture(keystrokes)
        else:
            return None

    def _introduce_typos(self, word: str, error_free: bool = False) -> Tuple[str, List[Typo]]:  # noqa: C901
        """Method to introduce typos in a given string.

        Either the word is changed into an existing common typo, or the word is
        processed as a stream of characters, each character having a chance of
        being mistyped.
        This method only add regular typos (deletions, additions, etc...), and
        is not introducing fuzzy typing.

        Args:
            word (str): Clean string where to add typos.
            error_free (bool): If set to True, don't introduce typo. Defaults
                to False.

        Returns:
            The noisy string.
            The list of typos introduced.
        """
        if error_free:
            return word, []

        # First of all, we either consider the word as a unit and introduce a
        # language-specific common typo (if available), or treat the word as a
        # sequence of character, where each character can have a typo
        if word in self.common_typos and sample(self.probs[Typo.COMMON_TYPO]):
            # Introduce a common typo
            return random.choice(self.common_typos[word]), [Typo.COMMON_TYPO]

        # From here, treat the word as a stream of characters, and potentially
        # add typos for each character
        noisy_word = ""
        typos = []
        word_chars = list(word)
        for i, char in enumerate(word_chars):
            # First, potentially apply simplifications (removing accent, or
            # lowercasing an uppercase character)
            # Note that if the full word is uppercase, we don't apply lowercase
            # simplification (doesn't feel like a natural typo a user would do)
            if char in self.klayout.letter_accents and sample(self.probs[Typo.SIMPLIFY_ACCENT]):
                char = strip_accents(char)
                typos.append(Typo.SIMPLIFY_ACCENT)
            if char.isupper() and len(word) > 1 and not word.isupper() and sample(self.probs[Typo.SIMPLIFY_CASE]):
                char = char.lower()
                typos.append(Typo.SIMPLIFY_CASE)

            # Check if this character exists on our keyboard
            try:
                *_, klayer_id = self.klayout.get_key_info(char)
                char_is_on_kb = True
                char_is_on_default_kb = klayer_id == 0
            except KeyError:
                char_is_on_kb = char_is_on_default_kb = False

            # Then, add the possible typo depending on the character type
            events = []
            is_first_char = bool(i == 0)
            is_last_char = bool(i >= (len(word_chars) - 1))
            if char.isnumeric() or not char_is_on_kb:
                # Don't introduce typos for numbers or symbols that are not on keyboard
                pass
            else:
                if not is_last_char:
                    # Only transpose char if they are on the same keyboard layer
                    try:
                        *_, next_char_klayer_id = self.klayout.get_key_info(word[i + 1])
                    except KeyError:
                        next_char_klayer_id = None

                    if klayer_id == next_char_klayer_id:
                        events.append(Typo.TRANSPOSE_CHAR)
                if char in self.klayout.spelling_symbols:
                    events.append(Typo.DELETE_SPELLING_SYMBOL)
                    events.append(Typo.ADD_SPELLING_SYMBOL)
                elif char.isspace():
                    events.append(Typo.DELETE_SPACE)
                    events.append(Typo.ADD_SPACE)
                elif char in string.punctuation:
                    events.append(Typo.DELETE_PUNCTUATION)
                    events.append(Typo.ADD_PUNCTUATION)
                elif char_is_on_default_kb:
                    events.append(Typo.DELETE_CHAR)
                    events.append(Typo.ADD_CHAR)

            # If it's the last character (and we are not typing a space),
            # don't add deletions typos, because it's an auto-completion case,
            # not auto-correction
            if is_last_char and word != SPACE:
                events = [e for e in events if e not in DELETIONS]

            # Get the probabilities for these possible events
            typo_probs = {e: self.probs[e] for e in events}
            if is_first_char:
                # Deleting the first character of the word is not so common, update the probabilities accordingly
                typo_probs = {e: p * FRONT_DELETION_MULTIPLIER if e in DELETIONS else p for e, p in typo_probs.items()}

            # And sample one of them
            typo = sample_among(typo_probs)

            # Process the typo
            if typo is Typo.TRANSPOSE_CHAR:
                noisy_char = word_chars[i + 1]
                word_chars[i + 1] = char
            elif typo in [Typo.DELETE_SPELLING_SYMBOL, Typo.DELETE_SPACE, Typo.DELETE_PUNCTUATION, Typo.DELETE_CHAR]:
                noisy_char = ""
            elif typo in [Typo.ADD_SPELLING_SYMBOL, Typo.ADD_SPACE, Typo.ADD_PUNCTUATION, Typo.ADD_CHAR]:
                noisy_char = f"{char}{char}"
            else:  # No typo
                noisy_char = char

            noisy_word += noisy_char
            if typo is not None:
                typos.append(typo)

        return noisy_word, typos

    def _fuzzy_type(
        self, word: str, error_free: bool = False
    ) -> Tuple[List[Optional[Tuple[float, float]]], str, List[Typo]]:
        """Method adding fuzzy typing.

        This method takes a string (potentially already noisy from other type
        of typos), and fuzzy-type it : simulate a user on a soft-keyboard.
        This "fat-finger syndrom" is simulated using two Gaussian
        distributions, one for each axis (x, y).
        This method also returns the generated keystrokes (positions on the
        keyboard), but only for the default keyboard (ID = 0). Keystrokes from
        other keyboard are set to None.

        Args:
            word (str): String to fuzzy-type.
            error_free (bool): If set to True, don't introduce typo. Defaults
                to False.

        Returns:
            List of keystrokes.
            Fuzzy string (corresponding to the keystrokes).
            List of typos introduced.
        """
        fuzzy_word = ""
        keystrokes = []
        typos = []

        # Type word character by character
        for char in word:
            try:
                width, height, x_center, y_center, klayer_id = self.klayout.get_key_info(char)
            except KeyError:
                # This character doesn't exist on the current keyboard
                # Just type it without introducing typo, like if the user copy-pasted it
                keystrokes.append(None)
                fuzzy_word += char
                continue

            # Sample a keystroke for this character
            # Note that we don't generate typos for characters outside of the default keyboard
            if error_free or klayer_id != 0:
                keystroke = (x_center, y_center)
            else:
                # Compute mu and sigma for the Normal distribution
                x_mu = x_center + self.x_offset
                y_mu = y_center + self.y_offset
                x_sigma = (width / 2) / self.x_ratio
                y_sigma = (height / 2) / self.y_ratio

                # Sample a position (x and y)
                keystroke = (random.gauss(x_mu, x_sigma), random.gauss(y_mu, y_sigma))

            # Convert it back to a character, to see where we tapped
            fuzzy_char = self.klayout.get_key(keystroke, klayer_id)

            # Save it (save the keystroke only if part of the default keyboard)
            keystrokes.append(keystroke if klayer_id == 0 else None)
            fuzzy_word += fuzzy_char
            if fuzzy_char != char:
                typos.append(Typo.SUBSTITUTE_CHAR)

        return keystrokes, fuzzy_word, typos

    def _is_correctable(self, word: str) -> bool:
        """Method returning True if we expect the given word to be corrected
        upon typo introduction, False otherwise.

        This is necessary to ensure we don't introduce typos in words that
        can't be corrected, because if we do, it will be counted as error.

        For now, are considered non-correctable :
         * Words that don't contains any letter (from Unicode standard)

        Args:
            word (str): Word to classify as correctable or not.

        Returns:
            True if the word is correctable (and therefore we can introduce
            typo), False otherwise.
        """
        # Use the Unicode category `L` (see https://en.wikipedia.org/wiki/Unicode_character_property#General_Category)
        return not bool(re.match(r"^[^\pL]+$", word))

    def _get_common_typos(self) -> Dict[str, List[str]]:
        """Retrieve the list (if it exists) of plausible common typos to use
        when introducing typos.

        Returns:
            Dictionary where the keys are the correct words and the values are
                the associated possible typos for this word.
        """
        plang = self.lang.split("-")[0]
        common_typos_cache_file = os.path.join(CACHE_DIR, f"{plang}.json")

        # Try to access the cached common typos, and if it fails, it means we
        # don't have it locally
        try:
            with open(common_typos_cache_file, "r") as f:
                return json.load(f)
        except FileNotFoundError:
            pass

        # File is not cached, download & process the common typos from online
        os.makedirs(os.path.dirname(common_typos_cache_file), exist_ok=True)
        typos = defaultdict(list)
        if plang == "en":
            response = requests.get(TWEET_TYPO_CORPUS_URL)
            for line in response.text.strip().split("\n"):
                typoed_word, correct_word, *_ = line.split("\t")
                typos[correct_word].append(typoed_word)
        else:
            return {}

        # Save the retrieved typos in cache
        with open(common_typos_cache_file, "w") as f:
            json.dump(typos, f, indent=4)

        return typos

type_till_space(words)

Method introducing typos word by word.

This method receives a list of words and types them while introducing typos. Most of the time, only one word is typed before the method returns; in some cases the space is mistyped or deleted, so the next word is typed as well.

Parameters:

Name Type Description Default
words List[str]

List of words to type.

required

Returns:

Type Description
List[Optional[Tuple[float, float]]]

List of keystrokes (may contain some None values).

str

The typed characters, as a string.

int

The number of words typed.

List[Typo]

The list of typos introduced in the string typed.

Source code in kebbie/noise_model.py
def type_till_space(
    self,
    words: List[str],
) -> Tuple[
    List[Optional[Tuple[float, float]]],
    str,
    int,
    List[Typo],
]:
    """Method introducing typos word by word.

    This method receives a list of words, and type these words while
    introducing typos.
    So most of the time, only one word will be typed and the method will
    return. In some cases, the space is mistyped or deleted, so two words
    are typed.

    Args:
        words (List[str]): List of words to type.

    Returns:
        List of keystrokes (may contains some None).
        The typed characters as string.
        The number of words typed.
        The list of typos introduced in the string typed.
    """
    all_keystrokes = []
    all_typed_char = ""
    all_typos = []

    for i, word in enumerate(words):
        # Some words can't be corrected (numbers, symbols, etc...) -> Don't introduce typos
        error_free = False if self._is_correctable(word) else True

        # Add typos in the word
        noisy_word, typos = self._introduce_typos(word, error_free=error_free)
        all_typos += typos

        # Type the word (fuzzy)
        keystrokes, typed_char, typos = self._fuzzy_type(noisy_word, error_free=error_free)
        all_keystrokes += keystrokes
        all_typed_char += typed_char
        all_typos += typos

        # Then, we try to type a space (separator between words)
        # TODO : Modify this part for languages without space
        noisy_space, sp_typo_1 = self._introduce_typos(SPACE)
        keystrokes, typed_char, sp_typo_2 = self._fuzzy_type(noisy_space)

        # If the space is correctly typed, return now, otherwise type the next word
        if not sp_typo_1 and not sp_typo_2:
            break
        else:
            all_keystrokes += keystrokes
            all_typed_char += typed_char
            all_typos += sp_typo_1 + sp_typo_2

    return all_keystrokes, all_typed_char, i + 1, all_typos
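
A minimal usage sketch (the language code is an assumption, the output is random, and on first use the model may download language resources such as common typos):

from kebbie.noise_model import NoiseModel

noise = NoiseModel("en-US")  # assumed language code
keystrokes, typed, n_words, typos = noise.type_till_space(["this", "is", "a", "test"])
print(typed, n_words, typos)  # e.g. "thus" 1 [<Typo.SUBSTITUTE_CHAR: ...>]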

swipe(word)

Method for creating an artificial swipe gesture given a word.

Parameters:

Name Type Description Default
word str

Word to type with a swipe gesture.

required

Returns:

Type Description
Optional[List[Tuple[float, float]]]

Positions (x, y) of the generated swipe gesture, or None if the swipe gesture couldn't be created.

Source code in kebbie/noise_model.py
def swipe(self, word: str) -> Optional[List[Tuple[float, float]]]:
    """Method for creating an artificial swipe gesture given a word.

    Args:
        word (str): Word to type with a swipe gesture.

    Returns:
        Positions (x, y) of the generated swipe gesture, or None if the
            swipe gesture couldn't be created.
    """
    # Some words can't be corrected (numbers, symbols, etc...) -> Don't introduce typos
    error_free = False if self._is_correctable(word) else True

    # Get the core keystrokes (fuzzy)
    keystrokes, *_ = self._fuzzy_type(word, error_free=error_free)

    # If we can swipe that word, create the corresponding artificial gesture
    if all(keystrokes) and len(keystrokes) > 1:
        return make_swipe_gesture(keystrokes)
    else:
        return None
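
A short companion sketch for swipe (same assumptions as above):

from kebbie.noise_model import NoiseModel

noise = NoiseModel("en-US")  # assumed language code
gesture = noise.swipe("hello")
if gesture is not None:
    print(len(gesture), gesture[0])  # list of (x, y) points over the keyboard
else:
    print("Word could not be swiped (e.g. a single letter or off-keyboard characters)")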

_introduce_typos(word, error_free=False)

Method to introduce typos in a given string.

Either the word is changed into an existing common typo, or the word is processed as a stream of characters, each character having a chance of being mistyped. This method only adds regular typos (deletions, additions, etc.) and does not introduce fuzzy typing.

Parameters:

Name Type Description Default
word str

Clean string where to add typos.

required
error_free bool

If set to True, no typos are introduced. Defaults to False.

False

Returns:

Type Description
str

The noisy string.

List[Typo]

The list of typos introduced.

Source code in kebbie/noise_model.py
def _introduce_typos(self, word: str, error_free: bool = False) -> Tuple[str, List[Typo]]:  # noqa: C901
    """Method to introduce typos in a given string.

    Either the word is changed into an existing common typo, or the word is
    processed as a stream of characters, each character having a chance of
    being mistyped.
    This method only add regular typos (deletions, additions, etc...), and
    is not introducing fuzzy typing.

    Args:
        word (str): Clean string where to add typos.
        error_free (bool): If set to True, don't introduce typo. Defaults
            to False.

    Returns:
        The noisy string.
        The list of typos introduced.
    """
    if error_free:
        return word, []

    # First of all, we either consider the word as a unit and introduce a
    # language-specific common typo (if available), or treat the word as a
    # sequence of character, where each character can have a typo
    if word in self.common_typos and sample(self.probs[Typo.COMMON_TYPO]):
        # Introduce a common typo
        return random.choice(self.common_typos[word]), [Typo.COMMON_TYPO]

    # From here, treat the word as a stream of characters, and potentially
    # add typos for each character
    noisy_word = ""
    typos = []
    word_chars = list(word)
    for i, char in enumerate(word_chars):
        # First, potentially apply simplifications (removing accent, or
        # lowercasing an uppercase character)
        # Note that if the full word is uppercase, we don't apply lowercase
        # simplification (doesn't feel like a natural typo a user would do)
        if char in self.klayout.letter_accents and sample(self.probs[Typo.SIMPLIFY_ACCENT]):
            char = strip_accents(char)
            typos.append(Typo.SIMPLIFY_ACCENT)
        if char.isupper() and len(word) > 1 and not word.isupper() and sample(self.probs[Typo.SIMPLIFY_CASE]):
            char = char.lower()
            typos.append(Typo.SIMPLIFY_CASE)

        # Check if this character exists on our keyboard
        try:
            *_, klayer_id = self.klayout.get_key_info(char)
            char_is_on_kb = True
            char_is_on_default_kb = klayer_id == 0
        except KeyError:
            char_is_on_kb = char_is_on_default_kb = False

        # Then, add the possible typo depending on the character type
        events = []
        is_first_char = bool(i == 0)
        is_last_char = bool(i >= (len(word_chars) - 1))
        if char.isnumeric() or not char_is_on_kb:
            # Don't introduce typos for numbers or symbols that are not on keyboard
            pass
        else:
            if not is_last_char:
                # Only transpose char if they are on the same keyboard layer
                try:
                    *_, next_char_klayer_id = self.klayout.get_key_info(word[i + 1])
                except KeyError:
                    next_char_klayer_id = None

                if klayer_id == next_char_klayer_id:
                    events.append(Typo.TRANSPOSE_CHAR)
            if char in self.klayout.spelling_symbols:
                events.append(Typo.DELETE_SPELLING_SYMBOL)
                events.append(Typo.ADD_SPELLING_SYMBOL)
            elif char.isspace():
                events.append(Typo.DELETE_SPACE)
                events.append(Typo.ADD_SPACE)
            elif char in string.punctuation:
                events.append(Typo.DELETE_PUNCTUATION)
                events.append(Typo.ADD_PUNCTUATION)
            elif char_is_on_default_kb:
                events.append(Typo.DELETE_CHAR)
                events.append(Typo.ADD_CHAR)

        # If it's the last character (and we are not typing a space),
        # don't add deletions typos, because it's an auto-completion case,
        # not auto-correction
        if is_last_char and word != SPACE:
            events = [e for e in events if e not in DELETIONS]

        # Get the probabilities for these possible events
        typo_probs = {e: self.probs[e] for e in events}
        if is_first_char:
            # Deleting the first character of the word is not so common, update the probabilities accordingly
            typo_probs = {e: p * FRONT_DELETION_MULTIPLIER if e in DELETIONS else p for e, p in typo_probs.items()}

        # And sample one of them
        typo = sample_among(typo_probs)

        # Process the typo
        if typo is Typo.TRANSPOSE_CHAR:
            noisy_char = word_chars[i + 1]
            word_chars[i + 1] = char
        elif typo in [Typo.DELETE_SPELLING_SYMBOL, Typo.DELETE_SPACE, Typo.DELETE_PUNCTUATION, Typo.DELETE_CHAR]:
            noisy_char = ""
        elif typo in [Typo.ADD_SPELLING_SYMBOL, Typo.ADD_SPACE, Typo.ADD_PUNCTUATION, Typo.ADD_CHAR]:
            noisy_char = f"{char}{char}"
        else:  # No typo
            noisy_char = char

        noisy_word += noisy_char
        if typo is not None:
            typos.append(typo)

    return noisy_word, typos
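
For a quick feel of the behaviour, the method can be called directly (it is private, so this is for illustration only; the output is random and the language code is an assumption):

from kebbie.noise_model import NoiseModel

noise = NoiseModel("en-US")  # assumed language code
for _ in range(3):
    noisy, typos = noise._introduce_typos("hello")
    print(noisy, typos)  # e.g. "helo" [<Typo.DELETE_CHAR: ...>], or "hello" [] when no typo is sampled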

_fuzzy_type(word, error_free=False)

Method adding fuzzy typing.

This method takes a string (potentially already noisy from other types of typos) and fuzzy-types it, simulating a user on a soft keyboard. This "fat-finger syndrome" is simulated using two Gaussian distributions, one for each axis (x, y). The method also returns the generated keystrokes (positions on the keyboard), but only for the default keyboard layer (ID = 0); keystrokes from other layers are set to None.

Parameters:

Name Type Description Default
word str

String to fuzzy-type.

required
error_free bool

If set to True, no typos are introduced. Defaults to False.

False

Returns:

Type Description
List[Optional[Tuple[float, float]]]

List of keystrokes.

str

Fuzzy string (corresponding to the keystrokes).

List[Typo]

List of typos introduced.

Source code in kebbie/noise_model.py
def _fuzzy_type(
    self, word: str, error_free: bool = False
) -> Tuple[List[Optional[Tuple[float, float]]], str, List[Typo]]:
    """Method adding fuzzy typing.

    This method takes a string (potentially already noisy from other type
    of typos), and fuzzy-type it : simulate a user on a soft-keyboard.
    This "fat-finger syndrome" is simulated using two Gaussian
    distributions, one for each axis (x, y).
    This method also returns the generated keystrokes (positions on the
    keyboard), but only for the default keyboard (ID = 0). Keystrokes from
    other keyboard are set to None.

    Args:
        word (str): String to fuzzy-type.
        error_free (bool): If set to True, don't introduce typo. Defaults
            to False.

    Returns:
        List of keystrokes.
        Fuzzy string (corresponding to the keystrokes).
        List of typos introduced.
    """
    fuzzy_word = ""
    keystrokes = []
    typos = []

    # Type word character by character
    for char in word:
        try:
            width, height, x_center, y_center, klayer_id = self.klayout.get_key_info(char)
        except KeyError:
            # This character doesn't exist on the current keyboard
            # Just type it without introducing typo, like if the user copy-pasted it
            keystrokes.append(None)
            fuzzy_word += char
            continue

        # Sample a keystroke for this character
        # Note that we don't generate typos for characters outside of the default keyboard
        if error_free or klayer_id != 0:
            keystroke = (x_center, y_center)
        else:
            # Compute mu and sigma for the Normal distribution
            x_mu = x_center + self.x_offset
            y_mu = y_center + self.y_offset
            x_sigma = (width / 2) / self.x_ratio
            y_sigma = (height / 2) / self.y_ratio

            # Sample a position (x and y)
            keystroke = (random.gauss(x_mu, x_sigma), random.gauss(y_mu, y_sigma))

        # Convert it back to a character, to see where we tapped
        fuzzy_char = self.klayout.get_key(keystroke, klayer_id)

        # Save it (save the keystroke only if part of the default keyboard)
        keystrokes.append(keystroke if klayer_id == 0 else None)
        fuzzy_word += fuzzy_char
        if fuzzy_char != char:
            typos.append(Typo.SUBSTITUTE_CHAR)

    return keystrokes, fuzzy_word, typos
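
The precision implied by the ratio parameters can be checked empirically with a standalone sketch: sample Gaussian taps for a single key, exactly as the code above does, and count how often they land inside it (2.32 is used as an example ratio; kebbie's actual default is DEFAULT_SIGMA_RATIO):

import random

width = height = 0.1
x_center = y_center = 0.05
ratio = 2.32  # example ratio, ~98% precision per axis
x_sigma, y_sigma = (width / 2) / ratio, (height / 2) / ratio

n, inside = 100_000, 0
for _ in range(n):
    x, y = random.gauss(x_center, x_sigma), random.gauss(y_center, y_sigma)
    inside += (0.0 <= x <= width) and (0.0 <= y <= height)
print(inside / n)  # ~0.96: both axes must stay on the key, i.e. roughly 0.98 * 0.98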

_is_correctable(word)

Method returning True if we expect the given word to be corrected after typo introduction, False otherwise.

This is necessary to ensure we don't introduce typos in words that can't be corrected, because if we did, it would be counted as an error.

For now, the following are considered non-correctable: * Words that don't contain any letter (as defined by the Unicode standard)

Parameters:

Name Type Description Default
word str

Word to classify as correctable or not.

required

Returns:

Type Description
bool

True if the word is correctable (and therefore we can introduce typos), False otherwise.

Source code in kebbie/noise_model.py
def _is_correctable(self, word: str) -> bool:
    """Method returning True if we expect the given word to be corrected
    upon typo introduction, False otherwise.

    This is necessary to ensure we don't introduce typos in words that
    can't be corrected, because if we do, it will be counted as error.

    For now, the following are considered non-correctable:
     * Words that don't contain any letter (from the Unicode standard)

    Args:
        word (str): Word to classify as correctable or not.

    Returns:
        True if the word is correctable (and therefore we can introduce
        typo), False otherwise.
    """
    # Use the Unicode category `L` (see https://en.wikipedia.org/wiki/Unicode_character_property#General_Category)
    return not bool(re.match(r"^[^\pL]+$", word))
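
The \pL escape is a Unicode property class ("any letter"), which the standard library re module does not support, so the implementation presumably relies on the third-party regex package (possibly imported as re). A small equivalent sketch:

import regex  # third-party package: pip install regex

def is_correctable(word):
    # Same test as above: non-correctable words contain no Unicode letter at all
    return not bool(regex.match(r"^[^\pL]+$", word))

for word in ["hello", "héllo", "1234", "!!!", "3D"]:
    print(word, is_correctable(word))
# hello True, héllo True, 1234 False, !!! False, 3D True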

_get_common_typos()

Retrieve the list (if it exists) of plausible common typos to use when introducing typos.

Returns:

Type Description
Dict[str, List[str]]

Dictionary where the keys are the correct words and the values are the associated possible typos for this word.

Source code in kebbie/noise_model.py
def _get_common_typos(self) -> Dict[str, List[str]]:
    """Retrieve the list (if it exists) of plausible common typos to use
    when introducing typos.

    Returns:
        Dictionary where the keys are the correct words and the values are
            the associated possible typos for this word.
    """
    plang = self.lang.split("-")[0]
    common_typos_cache_file = os.path.join(CACHE_DIR, f"{plang}.json")

    # Try to access the cached common typos, and if it fails, it means we
    # don't have it locally
    try:
        with open(common_typos_cache_file, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        pass

    # File is not cached, download & process the common typos from online
    os.makedirs(os.path.dirname(common_typos_cache_file), exist_ok=True)
    typos = defaultdict(list)
    if plang == "en":
        response = requests.get(TWEET_TYPO_CORPUS_URL)
        for line in response.text.strip().split("\n"):
            typoed_word, correct_word, *_ = line.split("\t")
            typos[correct_word].append(typoed_word)
    else:
        return {}

    # Save the retrieved typos in cache
    with open(common_typos_cache_file, "w") as f:
        json.dump(typos, f, indent=4)

    return typos
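
The downloaded corpus is parsed as tab-separated lines, typo first and correct word second, with any extra columns ignored. A self-contained sketch of the same processing on a made-up sample:

from collections import defaultdict

sample = "teh\tthe\nwhit\twith\nwiht\twith"  # invented lines in the same "<typo>\t<correct>" format

typos = defaultdict(list)
for line in sample.strip().split("\n"):
    typoed_word, correct_word, *_ = line.split("\t")
    typos[correct_word].append(typoed_word)

print(dict(typos))  # {'the': ['teh'], 'with': ['whit', 'wiht']}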

oracle.py

Module defining the Oracle class, which takes care of iterating over the dataset, introducing typos using the noise model, and querying the Corrector to correct these typos. The scorer is then used to compute performance metrics, and the results are returned.

Oracle

Class that takes care of testing a Corrector. It basically gets clean text data, adds noise to it, sends the noisy data to the Corrector, and scores its output.

This class spawns multiple processes to decrease runtime.

Parameters:

Name Type Description Default
lang str

Language used.

required
test_data Dict[str, List[str]]

List of clean sentences for each domain.

required
custom_keyboard Dict

If provided, instead of relying on the keyboard layout provided by default, uses the given keyboard layout.

required
track_mistakes bool

Set to True for tracking the most common mistakes. Most common mistakes are added to the results dictionary.

required
n_most_common_mistakes int

If track_mistakes is set to True, the number of top mistakes to record.

required
beta float

Beta to use for computing the F-beta score.

required
Source code in kebbie/oracle.py
class Oracle:
    """Class that takes care of testing a Corrector. It basically gets clean
    text data, adds noise to it, send the noisy data to the Corrector, and
    scores its output.

    This class spawn multiple processes to decrease runtime.

    Args:
        lang (str): Language used.
        test_data (Dict[str, List[str]]): List of clean sentences for each
            domain.
        custom_keyboard (Dict): If provided, instead of relying on
            the keyboard layout provided by default, uses the given keyboard
            layout.
        track_mistakes (bool): Set to `True` for tracking the most
            common mistakes. Most common mistakes are added to the results
            dictionary.
        n_most_common_mistakes (int): If `track_mistakes` is set to
            `True`, the top X mistakes to record.
        beta (float): Beta to use for computing the F-beta score.
    """

    def __init__(
        self,
        lang: str,
        test_data: Dict[str, List[str]],
        custom_keyboard: Dict,
        track_mistakes: bool,
        n_most_common_mistakes: int,
        beta: float,
    ) -> None:
        super().__init__()

        self.lang = lang
        self.data = test_data
        self.custom_keyboard = custom_keyboard
        self.track_mistakes = track_mistakes
        self.n_most_common_mistakes = n_most_common_mistakes
        self.beta = beta

    def test(self, corrector: Union[Corrector, List[Corrector]], n_proc: Optional[int], seed: int) -> Dict:
        """Main method, it tests the given Corrector, and returns results as a
        dictionary.

        This method spawn multiple processes to decrease runtime.

        Args:
            corrector (Union[Corrector, List[Corrector]]): Corrector to test.
                If a list of Corrector is given, the argument `n_proc` is
                ignored, and one corrector is assigned for each process.
            n_proc (Optional[int]): Number of processes to use. If `None`,
                `os.cpu_count()` is used.
            seed (int): Seed to use for running the tests.

        Returns:
            Results formatted in a dictionary.
        """
        # Initialize a global Scorer here, that will gather counts across processes
        scorer = Scorer(domains=self.data.keys(), track_mistakes=self.track_mistakes)

        # For multiprocessing
        n_proc = n_proc if n_proc is not None else os.cpu_count()
        d_size = sum(len(d) for d in self.data.values())

        # Create the corrector for each process
        proc_correctors = mp.Queue()
        if isinstance(corrector, Corrector):
            for _ in range(n_proc):
                proc_correctors.put(corrector)
        else:
            # If we already have a list of correctors, assign one for each process
            n_proc = len(corrector)
            for c in corrector:
                proc_correctors.put(c)

        with mp.Pool(
            processes=n_proc,
            initializer=init_tester,
            initargs=(tester, self.lang, self.custom_keyboard, proc_correctors, seed, self.track_mistakes),
        ) as pool, tqdm(total=d_size) as pbar:
            # Test data is made of several domain, where each domain contains a list of sentences
            for domain, sentence_list in self.data.items():
                chunk_size = max(min(CHUNK_SIZE, len(sentence_list) // n_proc), 1)
                for scr in pool.imap_unordered(tester, sentence_list, chunksize=chunk_size):
                    scr.set_domain(domain)
                    scorer.add(scr)
                    pbar.update(1)

        # Retrieve the results
        results = scorer.score(beta=self.beta)

        # Then potentially add the most common mistakes
        if self.track_mistakes:
            mistakes = {}
            for task in ["nwp", "acp", "acr"]:
                task_name = {"nwp": "next_word_prediction", "acp": "auto_completion", "acr": "auto_correction"}[task]

                m_count = getattr(scorer, f"{task}_mistakes")

                mistakes[task_name] = [("Count", "Expected", "Predictions", "Context")]
                for m, c in m_count.most_common(self.n_most_common_mistakes):
                    mistakes[task_name].append((c, m.actual, f"[{', '.join(m.preds)}]", m.context))

            results["most_common_mistakes"] = mistakes

        return results

test(corrector, n_proc, seed)

Main method: it tests the given Corrector and returns the results as a dictionary.

This method spawns multiple processes to decrease runtime.

Parameters:

Name Type Description Default
corrector Union[Corrector, List[Corrector]]

Corrector to test. If a list of Corrector is given, the argument n_proc is ignored, and one corrector is assigned for each process.

required
n_proc Optional[int]

Number of processes to use. If None, os.cpu_count() is used.

required
seed int

Seed to use for running the tests.

required

Returns:

Type Description
Dict

Results formatted in a dictionary.

Source code in kebbie/oracle.py
def test(self, corrector: Union[Corrector, List[Corrector]], n_proc: Optional[int], seed: int) -> Dict:
    """Main method, it tests the given Corrector, and returns results as a
    dictionary.

    This method spawn multiple processes to decrease runtime.

    Args:
        corrector (Union[Corrector, List[Corrector]]): Corrector to test.
            If a list of Corrector is given, the argument `n_proc` is
            ignored, and one corrector is assigned for each process.
        n_proc (Optional[int]): Number of processes to use. If `None`,
            `os.cpu_count()` is used.
        seed (int): Seed to use for running the tests.

    Returns:
        Results formatted in a dictionary.
    """
    # Initialize a global Scorer here, that will gather counts across processes
    scorer = Scorer(domains=self.data.keys(), track_mistakes=self.track_mistakes)

    # For multiprocessing
    n_proc = n_proc if n_proc is not None else os.cpu_count()
    d_size = sum(len(d) for d in self.data.values())

    # Create the corrector for each process
    proc_correctors = mp.Queue()
    if isinstance(corrector, Corrector):
        for _ in range(n_proc):
            proc_correctors.put(corrector)
    else:
        # If we already have a list of correctors, assign one for each process
        n_proc = len(corrector)
        for c in corrector:
            proc_correctors.put(c)

    with mp.Pool(
        processes=n_proc,
        initializer=init_tester,
        initargs=(tester, self.lang, self.custom_keyboard, proc_correctors, seed, self.track_mistakes),
    ) as pool, tqdm(total=d_size) as pbar:
        # Test data is made of several domain, where each domain contains a list of sentences
        for domain, sentence_list in self.data.items():
            chunk_size = max(min(CHUNK_SIZE, len(sentence_list) // n_proc), 1)
            for scr in pool.imap_unordered(tester, sentence_list, chunksize=chunk_size):
                scr.set_domain(domain)
                scorer.add(scr)
                pbar.update(1)

    # Retrieve the results
    results = scorer.score(beta=self.beta)

    # Then potentially add the most common mistakes
    if self.track_mistakes:
        mistakes = {}
        for task in ["nwp", "acp", "acr"]:
            task_name = {"nwp": "next_word_prediction", "acp": "auto_completion", "acr": "auto_correction"}[task]

            m_count = getattr(scorer, f"{task}_mistakes")

            mistakes[task_name] = [("Count", "Expected", "Predictions", "Context")]
            for m, c in m_count.most_common(self.n_most_common_mistakes):
                mistakes[task_name].append((c, m.actual, f"[{', '.join(m.preds)}]", m.context))

        results["most_common_mistakes"] = mistakes

    return results

init_tester(fn, lang, custom_keyboard, correctors, seed, track_mistakes)

Function run at process initialization for Tester workers.

Each worker in a Pool will run this function when created. It will instantiate several things needed for testing the given corrector: * A Tokenizer to split sentences into words * A NoiseModel to introduce typos * A Corrector instance, which is the model we want to test

Parameters:

Name Type Description Default
fn Callable

Main tester function (instantiated objects will be attached to this function).

required
lang str

Language used.

required
custom_keyboard Dict

If provided, instead of relying on the keyboard layout provided by default, uses the given keyboard layout.

required
correctors Queue

Queue containing the correctors to test. Each process will get the next corrector available in the queue.

required
seed int

Base seed to use.

required
track_mistakes bool

Set to True for tracking the most common mistakes.

required
Source code in kebbie/oracle.py
def init_tester(
    fn: Callable, lang: str, custom_keyboard: Dict, correctors: mp.Queue, seed: int, track_mistakes: bool
) -> None:
    """Function run at process initialization for Tester workers.

    Each worker in a Pool will run this function when created. It will
    instantiate several things needed for testing the given corrector:
     * A Tokenizer to split sentences into words
     * A NoiseModel to introduce typos
     * A Corrector instance, which is the model we want to test

    Args:
        fn (Callable): Main tester function (instantiated objects will be
            attached to this function).
        lang (str): Language used.
        custom_keyboard (Dict, optional): If provided, instead of relying on
            the keyboard layout provided by default, uses the given keyboard
            layout.
        correctors (mp.Queue): Queue containing list of correctors to test.
            Each process will get the next corrector available in queue.
        seed (int): Base seed to use.
        track_mistakes (bool): Set to `True` for tracking the most common
            mistakes.
    """
    fn.tokenizer = BasicTokenizer()
    fn.noisy = NoiseModel(lang, custom_keyboard=custom_keyboard)
    fn.corrector = correctors.get()
    fn.base_seed = seed
    fn.track_mistakes = track_mistakes

tester(sentence)

Function to test a given sentence.

It uses the noise model to introduce typos word by word, runs the Corrector on various tasks (auto-completion, auto-correction, next-word prediction), and scores the results.

Parameters:

Name Type Description Default
sentence str

Sentence to use as data for the test.

required

Returns:

Type Description
Scorer

Scorer class with the prediction counts for this sentence.

Source code in kebbie/oracle.py
def tester(sentence: str) -> Scorer:
    """Function to test a given sentence.

    It uses the noise model to introduce typos word by word, runs the
    Corrector on various tasks (auto-completion, auto-correction, next-word
    prediction), and scores the results.

    Args:
        sentence (str): Sentence to use as data for the test.

    Returns:
        Scorer class with the prediction counts for this sentence.
    """
    # Set the seed for reproducibility, using the hash of the sentence
    hsh = int(hashlib.sha256(sentence.encode("utf-8")).hexdigest(), 16)
    random.seed(tester.base_seed + hsh)
    rnd_state = random.getstate()

    # Tokenize the sentence into words
    sentence = tester.tokenizer.preprocess(sentence)
    words = tester.tokenizer.word_split(sentence)

    context = ""
    # Keep track of prediction counts for this sentence with a local scorer
    scorer = Scorer(domains=[None], track_mistakes=tester.track_mistakes)
    while words and len(context) < MAX_CHAR_PER_SENTENCE:
        # Before randomly generating typo, set the random state for determinism
        random.setstate(rnd_state)

        # It's slow to generate a swipe gesture for every sentence, so only run it sometimes
        word_to_swipe = words[0]
        swipe_gesture = tester.noisy.swipe(word_to_swipe) if sample(SWIPE_PROB) else None

        # Generate noisy keystrokes for the next word(s)
        keystrokes, typed_word, n_word_typed, typos = tester.noisy.type_till_space(words)

        # Get the clean word(s), update the remaining words to type and get the next word
        actual_word = " ".join(words[:n_word_typed])
        words = words[n_word_typed:]
        next_word = words[0] if len(words) > 0 else None

        # We are done with generating typo, save the random state for the next iteration
        rnd_state = random.getstate()

        if swipe_gesture:
            # Call the swipe model
            preds, memory, runtime = tester.corrector.profiled_resolve_swipe(context, swipe_gesture)
            scorer.swp(word_to_swipe, preds, context=context, memory=memory, runtime=runtime)

        # Call the model for auto-completion (for long enough words)
        if len(typed_word) > 1 and len(actual_word) > 1:
            partial_keystrokes, partial_word = sample_partial_word(keystrokes, typed_word, actual_word)
            preds, memory, runtime = tester.corrector.profiled_auto_complete(context, partial_keystrokes, partial_word)
            scorer.acp(actual_word, preds, partial_word=partial_word, context=context, memory=memory, runtime=runtime)

        # Call the model for auto-correction
        preds, memory, runtime = tester.corrector.profiled_auto_correct(context, keystrokes, typed_word)
        scorer.acr(
            actual_word, preds, typed_word=typed_word, context=context, typos=typos, memory=memory, runtime=runtime
        )

        # Update the context for the next iteration (input forcing)
        context = tester.tokenizer.update_context(context, actual_word)

        # Call the model for next-word prediction
        if next_word:
            preds, memory, runtime = tester.corrector.profiled_predict_next_word(context)
            scorer.nwp(next_word, preds, context=context, memory=memory, runtime=runtime)

    return scorer
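
The seeding scheme above makes each sentence reproducible on its own: the random state depends only on the base seed and a hash of the sentence, not on the order in which sentences are processed across workers. A standalone illustration (not kebbie code):

import hashlib
import random


def sentence_seed(sentence: str, base_seed: int = 42) -> int:
    # Same sentence + same base seed => same stream of random numbers.
    hsh = int(hashlib.sha256(sentence.encode("utf-8")).hexdigest(), 16)
    return base_seed + hsh


random.seed(sentence_seed("hello world"))
first_run = [random.random() for _ in range(3)]

random.seed(sentence_seed("hello world"))
second_run = [random.random() for _ in range(3)]

assert first_run == second_run  # identical draws for the same sentence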

scorer.py

Module implementing Scorer, a class that keeps track of how many errors the model is making and outputs various corresponding metrics.

Count dataclass

Structure representing the most basic counts for a task.

It counts:
  • Number of correct predictions
  • Number of top3-correct predictions
  • Total number of predictions

Source code in kebbie/scorer.py
@dataclass
class Count:
    """Structure representing the most basic counts for a task.

    It counts :
    * Number of correct predictions
    * Number of top3-correct predictions
    * Total number of predictions
    """

    correct: int = 0  # Number of times the first prediction was correct
    correct_3: int = 0  # Number of times one of the top-3 predictions was correct
    total: int = 0  # Total number of predictions

    def __add__(self, count: Count) -> Count:
        """Merge two `Count` instance by adding their counts.

        Args:
            count (Count): Count instance to add.

        Returns:
            Merged Count.
        """
        return Count(
            correct=self.correct + count.correct,
            correct_3=self.correct_3 + count.correct_3,
            total=self.total + count.total,
        )

    def __mul__(self, proportion: float) -> Count:
        """Multiply the current `Count` instance by a given proportion.

        Args:
            proportion (float): Proportion to multiply by.

        Returns:
            Count with the right proportion.
        """
        return Count(
            correct=round(self.correct * proportion),
            correct_3=round(self.correct_3 * proportion),
            total=round(self.total * proportion),
        )

__add__(count)

Merge two Count instances by adding their counts.

Parameters:

Name Type Description Default
count Count

Count instance to add.

required

Returns:

Type Description
Count

Merged Count.

Source code in kebbie/scorer.py
def __add__(self, count: Count) -> Count:
    """Merge two `Count` instance by adding their counts.

    Args:
        count (Count): Count instance to add.

    Returns:
        Merged Count.
    """
    return Count(
        correct=self.correct + count.correct,
        correct_3=self.correct_3 + count.correct_3,
        total=self.total + count.total,
    )

__mul__(proportion)

Multiply the current Count instance by a given proportion.

Parameters:

Name Type Description Default
proportion float

Proportion to multiply by.

required

Returns:

Type Description
Count

Count with the right proportion.

Source code in kebbie/scorer.py
def __mul__(self, proportion: float) -> Count:
    """Multiply the current `Count` instance by a given proportion.

    Args:
        proportion (float): Proportion to multiply by.

    Returns:
        Count with the right proportion.
    """
    return Count(
        correct=round(self.correct * proportion),
        correct_3=round(self.correct_3 * proportion),
        total=round(self.total * proportion),
    )
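
As a quick usage sketch (assuming `Count` is importable from kebbie.scorer, where it is defined):

from kebbie.scorer import Count

a = Count(correct=3, correct_3=5, total=10)
b = Count(correct=1, correct_3=2, total=5)

merged = a + b       # Count(correct=4, correct_3=7, total=15)
half = merged * 0.5  # counts are rounded: Count(correct=2, correct_3=4, total=8)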

Mistake dataclass

Structure representing a mistake (including the context of the mistake, the expected word and the predictions).

Source code in kebbie/scorer.py
@dataclass(eq=True, frozen=True)
class Mistake:
    """Structure representing a mistake (including the context of the mistake,
    the expected word and the predictions).
    """

    actual: str = field(compare=True)
    preds: List[str] = field(compare=False)
    context: str = field(compare=False)
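
Because the dataclass is frozen and only `actual` takes part in comparison, two Mistake instances with the same expected word are equal (and hash identically) even if their predictions or context differ. This is what lets a Counter aggregate mistakes per expected word, as the Scorer does. A minimal sketch, assuming Mistake is imported from kebbie.scorer:

from collections import Counter

from kebbie.scorer import Mistake

mistakes = Counter()
mistakes.update([Mistake(actual="their", preds=["there", "they"], context="over ")])
mistakes.update([Mistake(actual="their", preds=["the", "them"], context="in ")])

# Both entries count as the same mistake on the word "their"
assert mistakes.most_common(1)[0][1] == 2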

Scorer

Class keeping track of the predictions and how correct they are, but also computing the associated score for each task at the end of the test.

Parameters:

Name Type Description Default
domains List[str]

The list of domains in the dataset. The Scorer keeps track of the score for each domain, so that we can spot discrepancies between domains, if any.

required
human_readable bool

If set to False, performance metrics (memory, runtime) are kept in their raw, numerical form. If set to True, these are converted to a human-readable string.

True
track_mistakes bool

Set to True for tracking the most common mistakes.

False
Source code in kebbie/scorer.py
class Scorer:
    """Class keeping track of the predictions and how correct they are, but
    also computing the associated score for each task at the end of the test.

    Args:
        domains (List[str]): The list of domains in the dataset. The Scorer
            keeps track of the score for each domain, so that we can spot
            discrepancies between domains, if any.
        human_readable (bool, optional): If set to `False`, performance metrics
            (memory, runtime) are kept in their raw, numerical form. If set to
            `True`, these are converted to a human readable string.
        track_mistakes (bool, optional): Set to `True` for tracking the most
            common mistakes.
    """

    def __init__(self, domains: List[str], human_readable: bool = True, track_mistakes: bool = False) -> None:
        self.human_readable = human_readable

        # For each task, create a dictionary of Counts
        # Each task has a different structure :

        # Next-word prediction : [domain] -> counts
        self.nwp_c = dd_x_layers(1)

        # Autocompletion : [domain] -> [typo/no_typo] -> [word_completion_rate] -> counts
        self.acp_c = dd_x_layers(3)

        # Autocorrection : [domain] -> [typo type/number of typo] -> counts
        self.acr_c = dd_x_layers(2)

        # Swipe resolution : [domain] -> counts
        self.swp_c = dd_x_layers(1)

        # Make sure we track each domain (create a 0-Count for each domain)
        for d in domains:
            _ = self.nwp_c[d], self.acp_c[d][WITH_TYPO][0], self.acr_c[d][None], self.swp_c[d]

        # Also keep track of memories & runtimes
        self.nwp_memories = []
        self.acp_memories = []
        self.acr_memories = []
        self.swp_memories = []
        self.nwp_runtimes = []
        self.acp_runtimes = []
        self.acr_runtimes = []
        self.swp_runtimes = []

        # Optionally track common mistakes
        self.track_mistakes = track_mistakes
        self.nwp_mistakes = Counter()
        self.acp_mistakes = Counter()
        self.acr_mistakes = Counter()
        self.swp_mistakes = Counter()

    def add(self, scorer) -> None:
        """Method to update the current Scorer with the counts from another
        Scorer.

        Args:
            scorer (Scorer): Scorer to add.
        """

        def update(d1, d2):
            for k in d2:
                if isinstance(d2[k], Count):
                    d1[k] += d2[k]
                else:
                    update(d1[k], d2[k])

        update(self.nwp_c, scorer.nwp_c)
        update(self.acp_c, scorer.acp_c)
        update(self.acr_c, scorer.acr_c)
        update(self.swp_c, scorer.swp_c)
        self.nwp_memories.extend(scorer.nwp_memories)
        self.acp_memories.extend(scorer.acp_memories)
        self.acr_memories.extend(scorer.acr_memories)
        self.swp_memories.extend(scorer.swp_memories)
        self.nwp_runtimes.extend(scorer.nwp_runtimes)
        self.acp_runtimes.extend(scorer.acp_runtimes)
        self.acr_runtimes.extend(scorer.acr_runtimes)
        self.swp_runtimes.extend(scorer.swp_runtimes)
        self.nwp_mistakes.update(scorer.nwp_mistakes)
        self.acp_mistakes.update(scorer.acp_mistakes)
        self.acr_mistakes.update(scorer.acr_mistakes)
        self.swp_mistakes.update(scorer.swp_mistakes)

    def nwp(
        self,
        true_word: str,
        predicted_words: List[str],
        context: str,
        memory: int,
        runtime: int,
        domain: Optional[str] = None,
    ) -> None:
        """Method used to record a prediction for the next-word prediction
        task.

        Args:
            true_word (str): The label (clean word to predict).
            predicted_words (List[str]): Predictions of the model.
            context (str): The context (previous words in the sentence).
            memory (int): Memory consumption for the call of the model.
            runtime (int): Runtime for the call of the model.
            domain (str): Domain of this prediction.
        """
        # Record memory & runtime
        if memory >= 0:
            self.nwp_memories.append(memory)
        if runtime >= 0:
            self.nwp_runtimes.append(runtime)

        # Record counts
        if len(predicted_words) > 0 and predicted_words[0] == true_word:
            self.nwp_c[domain].correct += 1
        if true_word in predicted_words[:3]:
            self.nwp_c[domain].correct_3 += 1
        else:
            # If the word is not in the top-3 predictions, this is a mistake
            if self.track_mistakes:
                self.nwp_mistakes.update([Mistake(actual=true_word, preds=predicted_words[:3], context=context)])

        self.nwp_c[domain].total += 1

    def acp(
        self,
        true_word: str,
        predicted_words: List[str],
        partial_word: str,
        context: str,
        memory: int,
        runtime: int,
        domain: Optional[str] = None,
    ) -> None:
        """Method used to record a prediction for the auto-completion task.

        Args:
            true_word (str): The label (clean word to predict).
            predicted_words (List[str]): Predictions of the model.
            partial_word (str): The input sent to the model (only part of the
                word to predict, with potential typos).
            context (str): The context (previous words in the sentence).
            memory (int): Memory consumption for the call of the model.
            runtime (int): Runtime for the call of the model.
            domain (str): Domain of this prediction.
        """
        # Record memory & runtime
        if memory >= 0:
            self.acp_memories.append(memory)
        if runtime >= 0:
            self.acp_runtimes.append(runtime)

        # Check if a typo was introduced or not
        has_typo = WITHOUT_TYPO if true_word.startswith(partial_word) else WITH_TYPO

        # Compute the completion rate
        completion_rate = round(len(partial_word) / len(true_word), 2)

        # Record counts
        if len(predicted_words) > 0 and predicted_words[0] == true_word:
            self.acp_c[domain][has_typo][completion_rate].correct += 1
        if true_word in predicted_words[:3]:
            self.acp_c[domain][has_typo][completion_rate].correct_3 += 1
        else:
            # If the word is not in the top-3 predictions, this is a mistake
            if self.track_mistakes:
                self.acp_mistakes.update(
                    [Mistake(actual=true_word, preds=predicted_words[:3], context=f"{context}{partial_word}")]
                )

        self.acp_c[domain][has_typo][completion_rate].total += 1

    def acr(
        self,
        true_word: str,
        predicted_words: List[str],
        typed_word: str,
        context: str,
        typos: List[Typo],
        memory: int,
        runtime: int,
        domain: Optional[str] = None,
    ) -> None:
        """Method used to record a prediction for the auto-correction task.

        Args:
            true_word (str): The label (clean word to predict).
            predicted_words (List[str]): Predictions of the model.
            typed_word (str): The word typed, containing potential typos.
            context (str): The context (previous words in the sentence).
            typos (List[Typo]): List of typos introduced.
            memory (int): Memory consumption for the call of the model.
            runtime (int): Runtime for the call of the model.
            domain (str): Domain of this prediction.
        """
        # Record memory & runtime
        if memory >= 0:
            self.acr_memories.append(memory)
        if runtime >= 0:
            self.acr_runtimes.append(runtime)

        # Get the type of typo
        if not typos:
            typo_type = None
        elif len(typos) == 1:
            typo_type = typos[0]
        else:
            typo_type = len(typos)

        # Record counts
        if len(predicted_words) > 0 and predicted_words[0] == true_word:
            self.acr_c[domain][typo_type].correct += 1
        if true_word in predicted_words[:3]:
            self.acr_c[domain][typo_type].correct_3 += 1
        else:
            # If the word is not in the top-3 predictions, this is a mistake
            if self.track_mistakes:
                self.acr_mistakes.update(
                    [Mistake(actual=true_word, preds=predicted_words[:3], context=f"{context}{typed_word}")]
                )

        self.acr_c[domain][typo_type].total += 1

    def swp(
        self,
        true_word: str,
        predicted_words: List[str],
        context: str,
        memory: int,
        runtime: int,
        domain: Optional[str] = None,
    ) -> None:
        """Method used to record a prediction for the swipe resolution task.

        Args:
            true_word (str): The label (clean word to predict).
            predicted_words (List[str]): Predictions of the model.
            context (str): The context (previous words in the sentence).
            memory (int): Memory consumption for the call of the model.
            runtime (int): Runtime for the call of the model.
            domain (str): Domain of this prediction.
        """
        # Record memory & runtime
        if memory >= 0:
            self.swp_memories.append(memory)
        if runtime >= 0:
            self.swp_runtimes.append(runtime)

        # Record counts
        if len(predicted_words) > 0 and predicted_words[0] == true_word:
            self.swp_c[domain].correct += 1
        if true_word in predicted_words[:3]:
            self.swp_c[domain].correct_3 += 1
        else:
            # If the word is not in the top-3 predictions, this is a mistake
            if self.track_mistakes:
                self.swp_mistakes.update([Mistake(actual=true_word, preds=predicted_words[:3], context=context)])

        self.swp_c[domain].total += 1

    def set_domain(self, domain: str) -> None:
        """Method setting the domain for the scores associated with no domain.

        To make it easier to score a single sentence, it's possible to call the
        scorer without a domain (see signature of `nwp()`, `acp()`, `acr()`).
        In this case the scores are associated to no domain (`None` key).
        This method allows the user to set the domain name for these scores
        with no domain (effectively moving the `None` domain scores to the
        given domain name).

        Note:
            If some scores were already linked to the given domain, these
            scores will be erased (replaced by the scores of the `None`
            domain).

        Args:
            domain (str): Domain name to associate the scores to.
        """
        if None in self.nwp_c:
            self.nwp_c[domain] = self.nwp_c.pop(None)
        if None in self.acp_c:
            self.acp_c[domain] = self.acp_c.pop(None)
        if None in self.acr_c:
            self.acr_c[domain] = self.acr_c.pop(None)
        if None in self.swp_c:
            self.swp_c[domain] = self.swp_c.pop(None)

    def _score_accuracy(self, c: Count) -> Dict:
        """Helper method to compute the accuracy given a prediction count.

        This method returns a dictionary with 3 metrics:
         * Accuracy
         * Top3 accuracy
         * Total number of predictions

        Args:
            c (Count): Count object to use to compute the accuracy.

        Returns:
            Dictionary with the computed metrics.
        """
        return {
            "accuracy": round_to_n(c.correct / c.total) if c.total != 0 else 0,
            "top3_accuracy": round_to_n(c.correct_3 / c.total) if c.total != 0 else 0,
            "n": c.total,
        }

    def _score_precision_recall(self, no_typo_c: Count, typo_c: Count, beta: float) -> Dict:
        """Helper method to compute the precision and recall for
        auto-correction.

        This method returns a dictionary with several metrics:
         * Accuracy
         * Precision
         * Recall
         * F-score
         * Top3 accuracy
         * Top3 precision
         * Top3 recall
         * Top3 F-score
         * Number of predictions with a typo
         * Total number of predictions

        For auto-correction, we need 2 Count objects : the counts of typos, and
        the counts of non-typo (to compute the True Negative and False Positive
        metrics).

        Args:
            no_typo_c (Count): Count object for the predictions where no typo
                were added.
            typo_c (Count): Count object for the predictions where typos were
                added.
            beta (float): Beta to use for computing the F-beta score.

        Returns:
            Dictionary with the computed metrics.
        """
        # The first step is to divide the counts into TN, FP, TP, FN
        tn = no_typo_c.correct
        fp = no_typo_c.total - no_typo_c.correct
        tp = typo_c.correct
        fn = typo_c.total - typo_c.correct

        tn_3 = no_typo_c.correct_3
        fp_3 = no_typo_c.total - no_typo_c.correct_3
        tp_3 = typo_c.correct_3
        fn_3 = typo_c.total - typo_c.correct_3

        # Then we compute the metrics
        p = precision(tp=tp, fp=fp)
        r = recall(tp=tp, fn=fn)

        p_3 = precision(tp=tp_3, fp=fp_3)
        r_3 = recall(tp=tp_3, fn=fn_3)

        return {
            "accuracy": round_to_n(accuracy(tp=tp, tn=tn, fp=fp, fn=fn)),
            "precision": round_to_n(p),
            "recall": round_to_n(r),
            "fscore": round_to_n(fbeta(precision=p, recall=r, beta=beta)),
            "top3_accuracy": round_to_n(accuracy(tp=tp_3, tn=tn_3, fp=fp_3, fn=fn_3)),
            "top3_precision": round_to_n(p_3),
            "top3_recall": round_to_n(r_3),
            "top3_fscore": round_to_n(fbeta(precision=p_3, recall=r_3, beta=beta)),
            "n_typo": typo_c.total,
            "n": no_typo_c.total + typo_c.total,
        }

    def _score_performances(self, memories: List[int], runtimes: List[int]) -> Dict:
        """Helper method to compute metrics related to the memory & runtime.

        This method returns a dictionary with several metrics :
         * The mean memory consumption
         * The min memory consumption
         * The max memory consumption
         * The mean running time
         * The fastest running time
         * The slowest running time

        Args:
            memories (List[int]): List of memory consumptions for a
                specific operation.
            runtimes (List[int]): List of runtimes for a specific operation.

        Returns:
            Dictionary with the computed metrics.
        """
        perf = {
            "mean_memory": stats.mean(memories) if memories else 0,
            "min_memory": min(memories) if memories else 0,
            "max_memory": max(memories) if memories else 0,
            "mean_runtime": stats.mean(runtimes) if runtimes else 0,
            "fastest_runtime": min(runtimes) if runtimes else 0,
            "slowest_runtime": max(runtimes) if runtimes else 0,
        }

        if self.human_readable:
            perf = {
                name: human_readable_memory(x) if name.endswith("memory") else human_readable_runtime(x)
                for name, x in perf.items()
            }

        return perf

    def score(self, beta: float = DEFAULT_BETA) -> Dict:  # noqa: C901
        """Method that computes the final scores (as well as some alternative
        metrics that can bring insight into the capabilities of the model), and
        outputs these in an organized dictionary.

        Args:
            beta (float, optional): Beta to use for computing the F-beta score.

        Returns:
            Dictionary containing the computed scores and metrics for the
            model tested.
        """
        # --- Next-word prediction ---
        # Group scores by domain
        per = defaultdict(Count)
        for domain, c in self.nwp_c.items():
            per[domain] += c
        total_c = sum(per.values(), Count())
        per_domain = {k: self._score_accuracy(c) for k, c in per.items()}

        # Task results
        nwp = {
            "score": self._score_accuracy(total_c),
            "per_domain": per_domain,
            "performances": self._score_performances(self.nwp_memories, self.nwp_runtimes),
        }

        # --- Auto-completion ---
        # Group scores by domain
        per = defaultdict(Count)
        for domain, d1 in self.acp_c.items():
            for has_typo, d2 in d1.items():
                for compl_rate, c in d2.items():
                    per[domain] += c
        total_c = sum(per.values(), Count())
        per_domain = {k: self._score_accuracy(c) for k, c in per.items()}

        # Group scores by completion rate
        per = defaultdict(Count)
        for domain, d1 in self.acp_c.items():
            for has_typo, d2 in d1.items():
                for compl_rate, c in d2.items():
                    per[compl_rate] += c
        per_compl_rate = {
            "<25%": self._score_accuracy(sum((c for k, c in per.items() if k < 0.25), Count())),
            "25%~50%": self._score_accuracy(sum((c for k, c in per.items() if 0.25 <= k < 0.5), Count())),
            "50%~75%": self._score_accuracy(sum((c for k, c in per.items() if 0.5 <= k < 0.75), Count())),
            ">75%": self._score_accuracy(sum((c for k, c in per.items() if 0.75 <= k), Count())),
        }

        # Group scores by with_typo / without_typo
        per = defaultdict(Count)
        for domain, d1 in self.acp_c.items():
            for has_typo, d2 in d1.items():
                for compl_rate, c in d2.items():
                    per[has_typo] += c
        per_other = {k: self._score_accuracy(per[k]) for k in [WITHOUT_TYPO, WITH_TYPO]}

        # Task results
        acp = {
            "score": self._score_accuracy(total_c),
            "per_domain": per_domain,
            "per_completion_rate": per_compl_rate,
            "per_other": per_other,
            "performances": self._score_performances(self.acp_memories, self.acp_runtimes),
        }

        # --- Auto-correction ---
        # Group scores by domain
        no_typo_per, typo_per = defaultdict(Count), defaultdict(Count)
        for domain, d1 in self.acr_c.items():
            for typo, c in d1.items():
                if typo is None:
                    no_typo_per[domain] += c
                else:
                    typo_per[domain] += c
        no_typo_total_c = sum(no_typo_per.values(), Count())
        typo_total_c = sum(typo_per.values(), Count())
        per_domain = {k: self._score_precision_recall(no_typo_per[k], typo_per[k], beta=beta) for k in no_typo_per}

        # Group scores by typo type
        no_typo_c, typo_per = Count(), defaultdict(Count)
        for domain, d1 in self.acr_c.items():
            for typo, c in d1.items():
                if typo is None:
                    no_typo_c += c
                else:
                    typo_per[typo] += c
        # Divide the total count of no-typo into each type of typos with the right proportions
        no_typo_per = defaultdict(Count, {k: no_typo_c * (c.total / typo_total_c.total) for k, c in typo_per.items()})
        per_typo_type = {t.name: self._score_precision_recall(no_typo_per[t], typo_per[t], beta=beta) for t in Typo}
        per_n_typo = {
            "1": self._score_precision_recall(
                sum((c for k, c in no_typo_per.items() if isinstance(k, Typo)), Count()),
                sum((c for k, c in typo_per.items() if isinstance(k, Typo)), Count()),
                beta=beta,
            ),
            "2": self._score_precision_recall(no_typo_per[2], typo_per[2], beta=beta),
            "3+": self._score_precision_recall(
                sum((c for k, c in no_typo_per.items() if isinstance(k, int) and k > 2), Count()),
                sum((c for k, c in typo_per.items() if isinstance(k, int) and k > 2), Count()),
                beta=beta,
            ),
        }

        # Task results
        acr = {
            "score": self._score_precision_recall(no_typo_total_c, typo_total_c, beta=beta),
            "per_domain": per_domain,
            "per_typo_type": per_typo_type,
            "per_number_of_typos": per_n_typo,
            "performances": self._score_performances(self.acr_memories, self.acr_runtimes),
        }

        # --- Swipe resolution ---
        # Group scores by domain
        per = defaultdict(Count)
        for domain, c in self.swp_c.items():
            per[domain] += c
        total_c = sum(per.values(), Count())
        per_domain = {k: self._score_accuracy(c) for k, c in per.items()}

        # Task results
        swp = {
            "score": self._score_accuracy(total_c),
            "per_domain": per_domain,
            "performances": self._score_performances(self.swp_memories, self.swp_runtimes),
        }

        # Final results
        results = {
            "next_word_prediction": nwp,
            "auto_completion": acp,
            "auto_correction": acr,
            "swipe_resolution": swp,
        }

        # Add the overall score
        results["overall_score"] = one_score(results)

        return results
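
A minimal sketch of how these pieces fit together, mirroring what the test loop does (one local Scorer per sentence, later merged into a global one; negative memory/runtime values are simply not recorded). The domain name and predictions below are placeholders:

from kebbie.scorer import Scorer

global_scorer = Scorer(domains=["news"])

local = Scorer(domains=[None])
local.nwp("world", ["world", "there", "now"], context="hello ", memory=-1, runtime=-1)
local.set_domain("news")

global_scorer.add(local)
assert global_scorer.nwp_c["news"].correct == 1
assert global_scorer.nwp_c["news"].total == 1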

add(scorer)

Method to update the current Scorer with the counts from another Scorer.

Parameters:

Name Type Description Default
scorer Scorer

Scorer to add.

required
Source code in kebbie/scorer.py
def add(self, scorer) -> None:
    """Method to update the current Scorer with the counts from another
    Scorer.

    Args:
        scorer (Scorer): Scorer to add.
    """

    def update(d1, d2):
        for k in d2:
            if isinstance(d2[k], Count):
                d1[k] += d2[k]
            else:
                update(d1[k], d2[k])

    update(self.nwp_c, scorer.nwp_c)
    update(self.acp_c, scorer.acp_c)
    update(self.acr_c, scorer.acr_c)
    update(self.swp_c, scorer.swp_c)
    self.nwp_memories.extend(scorer.nwp_memories)
    self.acp_memories.extend(scorer.acp_memories)
    self.acr_memories.extend(scorer.acr_memories)
    self.swp_memories.extend(scorer.swp_memories)
    self.nwp_runtimes.extend(scorer.nwp_runtimes)
    self.acp_runtimes.extend(scorer.acp_runtimes)
    self.acr_runtimes.extend(scorer.acr_runtimes)
    self.swp_runtimes.extend(scorer.swp_runtimes)
    self.nwp_mistakes.update(scorer.nwp_mistakes)
    self.acp_mistakes.update(scorer.acp_mistakes)
    self.acr_mistakes.update(scorer.acr_mistakes)
    self.swp_mistakes.update(scorer.swp_mistakes)

nwp(true_word, predicted_words, context, memory, runtime, domain=None)

Method used to record a prediction for the next-word prediction task.

Parameters:

Name Type Description Default
true_word str

The label (clean word to predict).

required
predicted_words List[str]

Predictions of the model.

required
context str

The context (previous words in the sentence).

required
memory int

Memory consumption for the call of the model.

required
runtime int

Runtime for the call of the model.

required
domain str

Domain of this prediction.

None
Source code in kebbie/scorer.py
def nwp(
    self,
    true_word: str,
    predicted_words: List[str],
    context: str,
    memory: int,
    runtime: int,
    domain: Optional[str] = None,
) -> None:
    """Method used to record a prediction for the next-word prediction
    task.

    Args:
        true_word (str): The label (clean word to predict).
        predicted_words (List[str]): Predictions of the model.
        context (str): The context (previous words in the sentence).
        memory (int): Memory consumption for the call of the model.
        runtime (int): Runtime for the call of the model.
        domain (str): Domain of this prediction.
    """
    # Record memory & runtime
    if memory >= 0:
        self.nwp_memories.append(memory)
    if runtime >= 0:
        self.nwp_runtimes.append(runtime)

    # Record counts
    if len(predicted_words) > 0 and predicted_words[0] == true_word:
        self.nwp_c[domain].correct += 1
    if true_word in predicted_words[:3]:
        self.nwp_c[domain].correct_3 += 1
    else:
        # If the word is not in the top-3 predictions, this is a mistake
        if self.track_mistakes:
            self.nwp_mistakes.update([Mistake(actual=true_word, preds=predicted_words[:3], context=context)])

    self.nwp_c[domain].total += 1

acp(true_word, predicted_words, partial_word, context, memory, runtime, domain=None)

Method used to record a prediction for the auto-completion task.

Parameters:

Name Type Description Default
true_word str

The label (clean word to predict).

required
predicted_words List[str]

Predictions of the model.

required
partial_word str

The input sent to the model (only part of the word to predict, with potential typos).

required
context str

The context (previous words in the sentence).

required
memory int

Memory consumption for the call of the model.

required
runtime int

Runtime for the call of the model.

required
domain str

Domain of this prediction.

None
Source code in kebbie/scorer.py
def acp(
    self,
    true_word: str,
    predicted_words: List[str],
    partial_word: str,
    context: str,
    memory: int,
    runtime: int,
    domain: Optional[str] = None,
) -> None:
    """Method used to record a prediction for the auto-completion task.

    Args:
        true_word (str): The label (clean word to predict).
        predicted_words (List[str]): Predictions of the model.
        partial_word (str): The input sent to the model (only part of the
            word to predict, with potential typos).
        context (str): The context (previous words in the sentence).
        memory (int): Memory consumption for the call of the model.
        runtime (int): Runtime for the call of the model.
        domain (str): Domain of this prediction.
    """
    # Record memory & runtime
    if memory >= 0:
        self.acp_memories.append(memory)
    if runtime >= 0:
        self.acp_runtimes.append(runtime)

    # Check if a typo was introduced or not
    has_typo = WITHOUT_TYPO if true_word.startswith(partial_word) else WITH_TYPO

    # Compute the completion rate
    completion_rate = round(len(partial_word) / len(true_word), 2)

    # Record counts
    if len(predicted_words) > 0 and predicted_words[0] == true_word:
        self.acp_c[domain][has_typo][completion_rate].correct += 1
    if true_word in predicted_words[:3]:
        self.acp_c[domain][has_typo][completion_rate].correct_3 += 1
    else:
        # If the word is not in the top-3 predictions, this is a mistake
        if self.track_mistakes:
            self.acp_mistakes.update(
                [Mistake(actual=true_word, preds=predicted_words[:3], context=f"{context}{partial_word}")]
            )

    self.acp_c[domain][has_typo][completion_rate].total += 1

acr(true_word, predicted_words, typed_word, context, typos, memory, runtime, domain=None)

Method used to record a prediction for the auto-correction task.

Parameters:

Name Type Description Default
true_word str

The label (clean word to predict).

required
predicted_words List[str]

Predictions of the model.

required
typed_word str

The word typed, containing potential typos.

required
context str

The context (previous words in the sentence).

required
typos List[Typo]

List of typos introduced.

required
memory int

Memory consumption for the call of the model.

required
runtime int

Runtime for the call of the model.

required
domain str

Domain of this prediction.

None
Source code in kebbie/scorer.py
def acr(
    self,
    true_word: str,
    predicted_words: List[str],
    typed_word: str,
    context: str,
    typos: List[Typo],
    memory: int,
    runtime: int,
    domain: Optional[str] = None,
) -> None:
    """Method used to record a prediction for the auto-correction task.

    Args:
        true_word (str): The label (clean word to predict).
        predicted_words (List[str]): Predictions of the model.
        typed_word (str): The word typed, containing potential typos.
        context (str): The context (previous words in the sentence).
        typos (List[Typo]): List of typos introduced.
        memory (int): Memory consumption for the call of the model.
        runtime (int): Runtime for the call of the model.
        domain (str): Domain of this prediction.
    """
    # Record memory & runtime
    if memory >= 0:
        self.acr_memories.append(memory)
    if runtime >= 0:
        self.acr_runtimes.append(runtime)

    # Get the type of typo
    if not typos:
        typo_type = None
    elif len(typos) == 1:
        typo_type = typos[0]
    else:
        typo_type = len(typos)

    # Record counts
    if len(predicted_words) > 0 and predicted_words[0] == true_word:
        self.acr_c[domain][typo_type].correct += 1
    if true_word in predicted_words[:3]:
        self.acr_c[domain][typo_type].correct_3 += 1
    else:
        # If the word is not in the top-3 predictions, this is a mistake
        if self.track_mistakes:
            self.acr_mistakes.update(
                [Mistake(actual=true_word, preds=predicted_words[:3], context=f"{context}{typed_word}")]
            )

    self.acr_c[domain][typo_type].total += 1

swp(true_word, predicted_words, context, memory, runtime, domain=None)

Method used to record a prediction for the swipe resolution task.

Parameters:

Name Type Description Default
true_word str

The label (clean word to predict).

required
predicted_words List[str]

Predictions of the model.

required
context str

The context (previous words in the sentence).

required
memory int

Memory consumption for the call of the model.

required
runtime int

Runtime for the call of the model.

required
domain str

Domain of this prediction.

None
Source code in kebbie/scorer.py
def swp(
    self,
    true_word: str,
    predicted_words: List[str],
    context: str,
    memory: int,
    runtime: int,
    domain: Optional[str] = None,
) -> None:
    """Method used to record a prediction for the swipe resolution task.

    Args:
        true_word (str): The label (clean word to predict).
        predicted_words (List[str]): Predictions of the model.
        context (str): The context (previous words in the sentence).
        memory (int): Memory consumption for the call of the model.
        runtime (int): Runtime for the call of the model.
        domain (str): Domain of this prediction.
    """
    # Record memory & runtime
    if memory >= 0:
        self.swp_memories.append(memory)
    if runtime >= 0:
        self.swp_runtimes.append(runtime)

    # Record counts
    if len(predicted_words) > 0 and predicted_words[0] == true_word:
        self.swp_c[domain].correct += 1
    if true_word in predicted_words[:3]:
        self.swp_c[domain].correct_3 += 1
    else:
        # If the word is not in the top-3 predictions, this is a mistake
        if self.track_mistakes:
            self.swp_mistakes.update([Mistake(actual=true_word, preds=predicted_words[:3], context=context)])

    self.swp_c[domain].total += 1

set_domain(domain)

Method setting the domain for the scores associated with no domain.

To make it easier to score a single sentence, it's possible to call the scorer without a domain (see the signatures of nwp(), acp(), acr()). In this case the scores are associated with no domain (None key). This method allows the user to set the domain name for these scores with no domain (effectively moving the None domain scores to the given domain name).

Note

If some scores were already linked to the given domain, these scores will be erased (replaced by the scores of the None domain).

Parameters:

Name Type Description Default
domain str

Domain name to associate the scores to.

required
Source code in kebbie/scorer.py
def set_domain(self, domain: str) -> None:
    """Method setting the domain for the scores associated with no domain.

    To make it easier to score a single sentence, it's possible to call the
    scorer without a domain (see signature of `nwp()`, `acp()`, `acr()`).
    In this case the scores are associated to no domain (`None` key).
    This method allows the user to set the domain name for these scores
    with no domain (effectively moving the `None` domain scores to the
    given domain name).

    Note:
        If some scores were already linked to the given domain, these
        scores will be erased (replaced by the scores of the `None`
        domain).

    Args:
        domain (str): Domain name to associate the scores to.
    """
    if None in self.nwp_c:
        self.nwp_c[domain] = self.nwp_c.pop(None)
    if None in self.acp_c:
        self.acp_c[domain] = self.acp_c.pop(None)
    if None in self.acr_c:
        self.acr_c[domain] = self.acr_c.pop(None)
    if None in self.swp_c:
        self.swp_c[domain] = self.swp_c.pop(None)

_score_accuracy(c)

Helper method to compute the accuracy given a prediction count.

This method returns a dictionary with 3 metrics
  • Accuracy
  • Top3 accuracy
  • Total number of predictions

Parameters:

Name Type Description Default
c Count

Count object to use to compute the accuracy.

required

Returns:

Type Description
Dict

Dictionary with the computed metrics.

Source code in kebbie/scorer.py
def _score_accuracy(self, c: Count) -> Dict:
    """Helper method to compute the accuracy given a prediction count.

    This method returns a dictionary with 3 metrics:
     * Accuracy
     * Top3 accuracy
     * Total number of predictions

    Args:
        c (Count): Count object to use to compute the accuracy.

    Returns:
        Dictionary with the computed metrics.
    """
    return {
        "accuracy": round_to_n(c.correct / c.total) if c.total != 0 else 0,
        "top3_accuracy": round_to_n(c.correct_3 / c.total) if c.total != 0 else 0,
        "n": c.total,
    }

_score_precision_recall(no_typo_c, typo_c, beta)

Helper method to compute the precision and recall for auto-correction.

This method returns a dictionary with several metrics
  • Accuracy
  • Precision
  • Recall
  • F-score
  • Top3 accuracy
  • Top3 precision
  • Top3 recall
  • Top3 F-score
  • Number of predictions with a typo
  • Total number of predictions

For auto-correction, we need two Count objects: the counts for predictions with typos, and the counts for predictions without typos (to compute the True Negative and False Positive metrics).

Parameters:

Name Type Description Default
no_typo_c Count

Count object for the predictions where no typo were added.

required
typo_c Count

Count object for the predictions where typos were added.

required
beta float

Beta to use for computing the F-beta score.

required

Returns:

Type Description
Dict

Dictionary with the computed metrics.

Source code in kebbie/scorer.py
def _score_precision_recall(self, no_typo_c: Count, typo_c: Count, beta: float) -> Dict:
    """Helper method to compute the precision and recall for
    auto-correction.

    This method returns a dictionary with several metrics:
     * Accuracy
     * Precision
     * Recall
     * F-score
     * Top3 accuracy
     * Top3 precision
     * Top3 recall
     * Top3 F-score
     * Number of predictions with a typo
     * Total number of predictions

    For auto-correction, we need 2 Count objects : the counts of typos, and
    the counts of non-typo (to compute the True Negative and False Positive
    metrics).

    Args:
        no_typo_c (Count): Count object for the predictions where no typo
            were added.
        typo_c (Count): Count object for the predictions where typos were
            added.
        beta (float): Beta to use for computing the F-beta score.

    Returns:
        Dictionary with the computed metrics.
    """
    # The first step is to divide the counts into TN, FP, TP, FN
    tn = no_typo_c.correct
    fp = no_typo_c.total - no_typo_c.correct
    tp = typo_c.correct
    fn = typo_c.total - typo_c.correct

    tn_3 = no_typo_c.correct_3
    fp_3 = no_typo_c.total - no_typo_c.correct_3
    tp_3 = typo_c.correct_3
    fn_3 = typo_c.total - typo_c.correct_3

    # Then we compute the metrics
    p = precision(tp=tp, fp=fp)
    r = recall(tp=tp, fn=fn)

    p_3 = precision(tp=tp_3, fp=fp_3)
    r_3 = recall(tp=tp_3, fn=fn_3)

    return {
        "accuracy": round_to_n(accuracy(tp=tp, tn=tn, fp=fp, fn=fn)),
        "precision": round_to_n(p),
        "recall": round_to_n(r),
        "fscore": round_to_n(fbeta(precision=p, recall=r, beta=beta)),
        "top3_accuracy": round_to_n(accuracy(tp=tp_3, tn=tn_3, fp=fp_3, fn=fn_3)),
        "top3_precision": round_to_n(p_3),
        "top3_recall": round_to_n(r_3),
        "top3_fscore": round_to_n(fbeta(precision=p_3, recall=r_3, beta=beta)),
        "n_typo": typo_c.total,
        "n": no_typo_c.total + typo_c.total,
    }
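
A worked example of the TP/FP/TN/FN mapping above, using plain arithmetic and hypothetical counts (independent of kebbie's precision/recall/fbeta helpers):

# 100 clean words: 90 left untouched (TN), 10 wrongly "corrected" (FP).
# 50 words with a typo: 40 fixed (TP), 10 missed (FN).
tn, fp = 90, 10
tp, fn = 40, 10

precision = tp / (tp + fp)                  # 0.8
recall = tp / (tp + fn)                     # 0.8
accuracy = (tp + tn) / (tp + tn + fp + fn)  # ~0.867

beta = 0.9  # illustrative value only
fscore = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)  # 0.8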

_score_performances(memories, runtimes)

Helper method to compute metrics related to the memory & runtime.

This method returns a dictionary with several metrics
  • The mean memory consumption
  • The min memory consumption
  • The max memory consumption
  • The mean running time
  • The fastest running time
  • The slowest running time

Parameters:

Name Type Description Default
memories List[int]

List of memory consumptions for a specific operation.

required
runtimes List[int]

List of runtimes for a specific operation.

required

Returns:

Type Description
Dict

Dictionary with the computed metrics.

Source code in kebbie/scorer.py
def _score_performances(self, memories: List[int], runtimes: List[int]) -> Dict:
    """Helper method to compute metrics related to the memory & runtime.

    This method returns a dictionary with several metrics :
     * The mean memory consumption
     * The min memory consumption
     * The max memory consumption
     * The mean running time
     * The fastest running time
     * The slowest running time

    Args:
        memories (List[int]): List of memory consumptions for a
            specific operation.
        runtimes (List[int]): List of runtimes for a specific operation.

    Returns:
        Dictionary with the computed metrics.
    """
    perf = {
        "mean_memory": stats.mean(memories) if memories else 0,
        "min_memory": min(memories) if memories else 0,
        "max_memory": max(memories) if memories else 0,
        "mean_runtime": stats.mean(runtimes) if runtimes else 0,
        "fastest_runtime": min(runtimes) if runtimes else 0,
        "slowest_runtime": max(runtimes) if runtimes else 0,
    }

    if self.human_readable:
        perf = {
            name: human_readable_memory(x) if name.endswith("memory") else human_readable_runtime(x)
            for name, x in perf.items()
        }

    return perf

score(beta=DEFAULT_BETA)

Method that computes the final scores (as well as some alternative metrics that can bring insight into the capabilities of the model), and outputs these in an organized dictionary.

Parameters:

Name Type Description Default
beta float

Beta to use for computing the F-beta score.

DEFAULT_BETA

Returns:

Type Description
Dict

Dictionary containing the computed scores and metrics for the model tested.

Source code in kebbie/scorer.py
def score(self, beta: float = DEFAULT_BETA) -> Dict:  # noqa: C901
    """Method that computes the final scores (as well as some alternative
    metrics that can bring insight into the capabilities of the model), and
    outputs these in an organized dictionary.

    Args:
        beta (float, optional): Beta to use for computing the F-beta score.

    Returns:
        Dictionary containing the computed scores and metrics for the
        model tested.
    """
    # --- Next-word prediction ---
    # Group scores by domain
    per = defaultdict(Count)
    for domain, c in self.nwp_c.items():
        per[domain] += c
    total_c = sum(per.values(), Count())
    per_domain = {k: self._score_accuracy(c) for k, c in per.items()}

    # Task results
    nwp = {
        "score": self._score_accuracy(total_c),
        "per_domain": per_domain,
        "performances": self._score_performances(self.nwp_memories, self.nwp_runtimes),
    }

    # --- Auto-completion ---
    # Group scores by domain
    per = defaultdict(Count)
    for domain, d1 in self.acp_c.items():
        for has_typo, d2 in d1.items():
            for compl_rate, c in d2.items():
                per[domain] += c
    total_c = sum(per.values(), Count())
    per_domain = {k: self._score_accuracy(c) for k, c in per.items()}

    # Group scores by completion rate
    per = defaultdict(Count)
    for domain, d1 in self.acp_c.items():
        for has_typo, d2 in d1.items():
            for compl_rate, c in d2.items():
                per[compl_rate] += c
    per_compl_rate = {
        "<25%": self._score_accuracy(sum((c for k, c in per.items() if k < 0.25), Count())),
        "25%~50%": self._score_accuracy(sum((c for k, c in per.items() if 0.25 <= k < 0.5), Count())),
        "50%~75%": self._score_accuracy(sum((c for k, c in per.items() if 0.5 <= k < 0.75), Count())),
        ">75%": self._score_accuracy(sum((c for k, c in per.items() if 0.75 <= k), Count())),
    }

    # Group scores by with_typo / without_typo
    per = defaultdict(Count)
    for domain, d1 in self.acp_c.items():
        for has_typo, d2 in d1.items():
            for compl_rate, c in d2.items():
                per[has_typo] += c
    per_other = {k: self._score_accuracy(per[k]) for k in [WITHOUT_TYPO, WITH_TYPO]}

    # Task results
    acp = {
        "score": self._score_accuracy(total_c),
        "per_domain": per_domain,
        "per_completion_rate": per_compl_rate,
        "per_other": per_other,
        "performances": self._score_performances(self.acp_memories, self.acp_runtimes),
    }

    # --- Auto-correction ---
    # Group scores by domain
    no_typo_per, typo_per = defaultdict(Count), defaultdict(Count)
    for domain, d1 in self.acr_c.items():
        for typo, c in d1.items():
            if typo is None:
                no_typo_per[domain] += c
            else:
                typo_per[domain] += c
    no_typo_total_c = sum(no_typo_per.values(), Count())
    typo_total_c = sum(typo_per.values(), Count())
    per_domain = {k: self._score_precision_recall(no_typo_per[k], typo_per[k], beta=beta) for k in no_typo_per}

    # Group scores by typo type
    no_typo_c, typo_per = Count(), defaultdict(Count)
    for domain, d1 in self.acr_c.items():
        for typo, c in d1.items():
            if typo is None:
                no_typo_c += c
            else:
                typo_per[typo] += c
    # Divide the total count of no-typo into each type of typos with the right proportions
    no_typo_per = defaultdict(Count, {k: no_typo_c * (c.total / typo_total_c.total) for k, c in typo_per.items()})
    per_typo_type = {t.name: self._score_precision_recall(no_typo_per[t], typo_per[t], beta=beta) for t in Typo}
    per_n_typo = {
        "1": self._score_precision_recall(
            sum((c for k, c in no_typo_per.items() if isinstance(k, Typo)), Count()),
            sum((c for k, c in typo_per.items() if isinstance(k, Typo)), Count()),
            beta=beta,
        ),
        "2": self._score_precision_recall(no_typo_per[2], typo_per[2], beta=beta),
        "3+": self._score_precision_recall(
            sum((c for k, c in no_typo_per.items() if isinstance(k, int) and k > 2), Count()),
            sum((c for k, c in typo_per.items() if isinstance(k, int) and k > 2), Count()),
            beta=beta,
        ),
    }

    # Task results
    acr = {
        "score": self._score_precision_recall(no_typo_total_c, typo_total_c, beta=beta),
        "per_domain": per_domain,
        "per_typo_type": per_typo_type,
        "per_number_of_typos": per_n_typo,
        "performances": self._score_performances(self.acr_memories, self.acr_runtimes),
    }

    # --- Swipe resolution ---
    # Group scores by domain
    per = defaultdict(Count)
    for domain, c in self.swp_c.items():
        per[domain] += c
    total_c = sum(per.values(), Count())
    per_domain = {k: self._score_accuracy(c) for k, c in per.items()}

    # Task results
    swp = {
        "score": self._score_accuracy(total_c),
        "per_domain": per_domain,
        "performances": self._score_performances(self.swp_memories, self.swp_runtimes),
    }

    # Final results
    results = {
        "next_word_prediction": nwp,
        "auto_completion": acp,
        "auto_correction": acr,
        "swipe_resolution": swp,
    }

    # Add the overall score
    results["overall_score"] = one_score(results)

    return results

dd_x_layers(n_layers=1)

Helper function for creating a nested defaultdict, with a specified number of nesting levels. The leaf object is a Count.

Parameters:

Name Type Description Default
n_layers int

Number of layers for the defaultdict.

1

Returns:

Type Description
defaultdict

Created nested defaultdict.

Source code in kebbie/scorer.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def dd_x_layers(n_layers: int = 1) -> defaultdict:
    """Helper function for creating a nested defaultdict, with a specified
    number of nest level. The end object is a Count.

    Args:
        n_layers (int): Number of layer for the defaultdict.

    Returns:
        Created nested defaultdict.
    """
    assert n_layers > 0, f"A default dict have at least 1 layer ({n_layers} given)"
    if n_layers == 1:
        return defaultdict(Count)
    else:
        return defaultdict(partial(dd_x_layers, n_layers=n_layers - 1))
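
As a rough sketch of how the nesting behaves, here is the same pattern with the leaf type swapped to int purely for illustration (the real helper produces Count leaves):

from collections import defaultdict
from functools import partial


def dd_x_layers_demo(n_layers: int = 1) -> defaultdict:
    # Same nesting logic as dd_x_layers, but with int leaves for the demo
    if n_layers == 1:
        return defaultdict(int)
    return defaultdict(partial(dd_x_layers_demo, n_layers=n_layers - 1))


counts = dd_x_layers_demo(n_layers=3)
# All intermediate levels are created on the fly, no manual initialization needed
counts["narrative"][True][0.25] += 1
counts["dialogue"][False][0.75] += 2
print(counts["narrative"][True][0.25])  # 1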

one_score(results)

One Score to rule them all, One Score to find them, One Score to bring them all and in the darkness bind them.

This function is here to gather the various testing metrics of a JET file in a single number, to easily compare models.

We take a single metric for each task and weight them based on the importance of the task (these metrics already share the same scale: between 0 and 1).

For NWP and ACP we take a top-3 metric, because these tasks usually involve a user action on a proposed list. For ACR and SWP we take a top-1 metric, since the result is usually applied automatically, without user input.

Parameters:

Name Type Description Default
results Dict

Testing results. Should be a dictionary containing all the metrics (used to compute the one score).

required

Returns:

Type Description
float

One score, computed from the results given.

Source code in kebbie/scorer.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def one_score(results: Dict) -> float:
    """One Score to rule them all, One Score to find them, One Score to bring
    them all and in the darkness bind them.

    This function is here to gather the various testing metrics of a JET file
    in a single number, to easily compare models.

    We take a single metric for each task, and weight them based on the
    importance of the task (these metrics already have the same scale : between
    0 and 1).

    For NWP and ACP we take a top-3 metric, because these tasks usually involve
    a user action from a proposed list. For ACR and SWP, we take a top-1
    metric, since usually it's automatically applied without user input.

    Args:
        results (Dict): Testing results. Should be a dictionary containing all
            the metrics (used to compute the one score).

    Returns:
        One score, computed from the results given.
    """
    nwp = results["next_word_prediction"]["score"]["top3_accuracy"]
    acp = results["auto_completion"]["score"]["top3_accuracy"]
    acr = results["auto_correction"]["score"]["fscore"]
    swp = results["swipe_resolution"]["score"]["accuracy"]

    return 0.15 * nwp + 0.2 * acp + 0.4 * acr + 0.25 * swp
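
A minimal usage sketch, assuming kebbie is installed so that one_score can be imported from kebbie.scorer, and using a hand-made results dictionary (not real test output) containing only the keys the function reads:

from kebbie.scorer import one_score

results = {
    "next_word_prediction": {"score": {"top3_accuracy": 0.30}},
    "auto_completion": {"score": {"top3_accuracy": 0.60}},
    "auto_correction": {"score": {"fscore": 0.80}},
    "swipe_resolution": {"score": {"accuracy": 0.90}},
}

# 0.15 * 0.30 + 0.2 * 0.60 + 0.4 * 0.80 + 0.25 * 0.90 ≈ 0.71
print(one_score(results))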

tokenizer.py

Module defining BasicTokenizer, a very basic tokenizer that separates a sentence into words.

BasicTokenizer

A basic tokenizer, used for regular Latin languages. This tokenizer simply uses the space character as word separator. Since it is used for testing only, we don't need to care about punctuation, etc.

Source code in kebbie/tokenizer.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class BasicTokenizer:
    """A basic tokenizer, used for regular latin languages.
    This tokenizer simply use space as word separator. Since it is used for
    testing only, we don't need to care about punctuations, etc...
    """

    def preprocess(self, sentence: str) -> str:
        """Method for simple preprocessing.

        The goal of this function is not to provide an extensive and clean
        preprocessing. The goal is just to normalize some characters (that
        are not in our keyboard, so the user can't officially type them) into
        their normal counterpart, that are in the keyboard.

        Args:
            sentence (str): String to normalize.

        Returns:
            Normalized string.
        """
        # Replace things that are like "
        sentence = sentence.replace("“", '"').replace("”", '"').replace("„", '"')

        # Replace things that are like '
        sentence = sentence.replace("’", "'").replace("ʻ", "'").replace("‘", "'").replace("´", "'").replace("ʼ", "'")

        # Replace things that are like -
        sentence = sentence.replace("–", "-").replace("—", "-").replace("‑", "-").replace("−", "-").replace("ー", "-")

        # Replace other punctuations
        sentence = sentence.replace("…", "...").replace("‚", ",").replace("․", ".")

        # TODO: Each keyboard has its own way to deal with punctuation
        # (applying auto-correction or not, displaying next-word prediction or
        # not, etc...). So for now we just get rid of the punctuations, it's a
        # convenient shortcut and it's fair to all keyboards.
        # Eventually we should find a better way to deal with that.
        sentence = re.sub(r"\s*\.+\s*", " ", sentence)
        sentence = re.sub(r"\s*[,:;\(\)\"!?\[\]\{\}~]\s*", " ", sentence)

        return sentence

    def word_split(self, sentence: str) -> List[str]:
        """Method for splitting a sentence into a list of words.

        Args:
            sentence (str): Sentence to split.

        Returns:
            List of words from the sentence.
        """
        return sentence.strip().split()

    def update_context(self, context: str, word: str) -> str:
        """Method for updating a context, given a word that was typed.

        Args:
            context (str): Existing context.
            word (str): Word being typed.

        Returns:
            Updated context.
        """
        return context + word + " "
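
A short usage sketch, assuming kebbie is installed so that BasicTokenizer can be imported from kebbie.tokenizer:

from kebbie.tokenizer import BasicTokenizer

tokenizer = BasicTokenizer()

sentence = "“Hello there!” she said… it's fine."
clean = tokenizer.preprocess(sentence)  # Curly quotes normalized, punctuation stripped
words = tokenizer.word_split(clean)     # ['Hello', 'there', 'she', 'said', "it's", 'fine']

context = ""
for word in words:
    context = tokenizer.update_context(context, word)
print(context)  # "Hello there she said it's fine "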

preprocess(sentence)

Method for simple preprocessing.

The goal of this function is not to provide extensive, clean preprocessing. The goal is just to normalize some characters (which are not on our keyboard, so the user can't officially type them) into their normal counterparts, which are on the keyboard.

Parameters:

Name Type Description Default
sentence str

String to normalize.

required

Returns:

Type Description
str

Normalized string.

Source code in kebbie/tokenizer.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def preprocess(self, sentence: str) -> str:
    """Method for simple preprocessing.

    The goal of this function is not to provide an extensive and clean
    preprocessing. The goal is just to normalize some characters (that
    are not in our keyboard, so the user can't officially type them) into
    their normal counterpart, that are in the keyboard.

    Args:
        sentence (str): String to normalize.

    Returns:
        Normalized string.
    """
    # Replace things that are like "
    sentence = sentence.replace("“", '"').replace("”", '"').replace("„", '"')

    # Replace things that are like '
    sentence = sentence.replace("’", "'").replace("ʻ", "'").replace("‘", "'").replace("´", "'").replace("ʼ", "'")

    # Replace things that are like -
    sentence = sentence.replace("–", "-").replace("—", "-").replace("‑", "-").replace("−", "-").replace("ー", "-")

    # Replace other punctuations
    sentence = sentence.replace("…", "...").replace("‚", ",").replace("․", ".")

    # TODO: Each keyboard has its own way to deal with punctuation
    # (applying auto-correction or not, displaying next-word prediction or
    # not, etc...). So for now we just get rid of the punctuations, it's a
    # convenient shortcut and it's fair to all keyboards.
    # Eventually we should find a better way to deal with that.
    sentence = re.sub(r"\s*\.+\s*", " ", sentence)
    sentence = re.sub(r"\s*[,:;\(\)\"!?\[\]\{\}~]\s*", " ", sentence)

    return sentence

word_split(sentence)

Method for splitting a sentence into a list of words.

Parameters:

Name Type Description Default
sentence str

Sentence to split.

required

Returns:

Type Description
List[str]

List of words from the sentence.

Source code in kebbie/tokenizer.py
51
52
53
54
55
56
57
58
59
60
def word_split(self, sentence: str) -> List[str]:
    """Method for splitting a sentence into a list of words.

    Args:
        sentence (str): Sentence to split.

    Returns:
        List of words from the sentence.
    """
    return sentence.strip().split()

update_context(context, word)

Method for updating a context, given a word that was typed.

Parameters:

Name Type Description Default
context str

Existing context.

required
word str

Word being typed.

required

Returns:

Type Description
str

Updated context.

Source code in kebbie/tokenizer.py
62
63
64
65
66
67
68
69
70
71
72
def update_context(self, context: str, word: str) -> str:
    """Method for updating a context, given a word that was typed.

    Args:
        context (str): Existing context.
        word (str): Word being typed.

    Returns:
        Updated context.
    """
    return context + word + " "

utils.py

Various util functions used by kebbie.

profile_fn(fn, *args, **kwargs)

Profile the runtime and memory usage of the given function.

Note that it will only account for memory allocated by Python (if you use a C/C++ library that does its own allocations, they won't be reported).

Parameters:

Name Type Description Default
fn Callable

Function to profile.

required
*args Any

Positional arguments to pass to the given function.

()
**kwargs Any

Keyword arguments to pass to the given function.

{}

Returns:

Type Description
Any

The return value of the function called.

int

The memory usage (in bytes).

int

The runtime (in nanoseconds).

Source code in kebbie/utils.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def profile_fn(fn: Callable, *args: Any, **kwargs: Any) -> Tuple[Any, int, int]:
    """Profile the runtime and memory usage of the given function.

    Note that it will only account for memory allocated by python (if you use
    a library in C/C++ that does its own allocation, it won't report it).

    Args:
        fn (Callable): Function to profile.
        *args: Positional arguments to pass to the given function.
        **kwargs: Keywords arguments to pass to the given function.

    Returns:
        The return value of the function called.
        The memory usage (in bytes).
        The runtime (in nano seconds).
    """
    tracemalloc.start()
    t0 = time.time()

    result = fn(*args, **kwargs)

    runtime = time.time() - t0
    _, memory = tracemalloc.get_traced_memory()

    return result, memory, runtime * SEC_TO_NANOSEC
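
A quick usage sketch, assuming kebbie is installed so that profile_fn can be imported from kebbie.utils (build_squares is just a throwaway function for the example):

from kebbie.utils import profile_fn


def build_squares(n):
    return [i * i for i in range(n)]


result, memory, runtime = profile_fn(build_squares, 100_000)
print(len(result))  # 100000
print(memory)       # Peak Python-allocated memory, in bytes
print(runtime)      # Runtime, in nanoseconds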

euclidian_dist(p1, p2)

Function computing the Euclidean distance between two points.

Parameters:

Name Type Description Default
p1 Tuple[float, float]

Point 1.

required
p2 Tuple[float, float]

Point 2.

required

Returns:

Type Description
float

Euclidean distance between the two given points.

Source code in kebbie/utils.py
45
46
47
48
49
50
51
52
53
54
55
def euclidian_dist(p1: Tuple[float, float], p2: Tuple[float, float]) -> float:
    """Function computing the euclidian distance between 2 points.

    Args:
        p1 (Tuple[float, float]): Point 1.
        p2 (Tuple[float, float]): Point 2.

    Returns:
        Euclidian distance between the 2 given points.
    """
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(p1, p2)))
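
For example (assuming kebbie is installed):

from kebbie.utils import euclidian_dist

print(euclidian_dist((0.0, 0.0), (3.0, 4.0)))  # 5.0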

load_keyboard(lang='en-US')

Load the keyboard data for the given language.

For now, only en-US is supported.

Parameters:

Name Type Description Default
lang str

Language of the keyboard to load.

'en-US'

Returns:

Type Description
Dict

The keyboard data.

Source code in kebbie/utils.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def load_keyboard(lang: str = "en-US") -> Dict:
    """Load the keyboard data for the given language.

    For now, only `en-US` is supported.

    Args:
        lang (str, optional): Language of the keyboard to load.

    Returns:
        The keyboard data.
    """
    layout_folder = Path(__file__).parent / "layouts"
    with open(layout_folder / f"{lang}.json", "r") as f:
        keyboard = json.load(f)
    return keyboard

strip_accents(s)

Util function for removing accents from a given string.

Parameters:

Name Type Description Default
s str

Accented string.

required

Returns:

Type Description
str

Same string, without accents.

Source code in kebbie/utils.py
75
76
77
78
79
80
81
82
83
84
85
def strip_accents(s: str) -> str:
    """Util function for removing accents from a given string.

    Args:
        s (str): Accented string.

    Returns:
        Same string, without accent.
    """
    nfkd_form = unicodedata.normalize("NFKD", s)
    return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

sample(proba)

Simple function to sample an event with the given probability. For example, calling sample(0.95) will return True in 95% of cases, and False in 5% of cases.

Parameters:

Name Type Description Default
proba float

Probability of the event happening. Should be between 0 and 1 (inclusive).

required

Returns:

Type Description
bool

True if the event was sampled, False otherwise.

Source code in kebbie/utils.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def sample(proba: float) -> bool:
    """Simple function to sample an event with the given probability.
    For example, calling `sample(0.95)` will return `True` in 95% cases, and
    `False` in 5% cases.

    Args:
        proba (float): Probability of the event to happen. Should be between 0
            and 1 (included).

    Returns:
        `True` if the event was sampled, `False` otherwise.
    """
    assert 0 <= proba <= 1, f"`{proba}` is not a valid probability (should be between 0 and 1)"
    if proba == 0:
        return False
    elif proba == 1:
        return True
    else:
        return random.choices([True, False], weights=[proba, 1 - proba])[0]

sample_among(probs, with_none=True)

Function that samples an event among several with different probabilities.

Parameters:

Name Type Description Default
probs Dict[Any, float]

Dictionary representing the different events and their probabilities. Each probability should be non-negative, and their sum should not exceed 1.

required
with_none bool

If set to True, add a None option (no event sampled).

True

Returns:

Type Description
Any

The corresponding key of the event sampled.

Source code in kebbie/utils.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def sample_among(probs: Dict[Any, float], with_none: bool = True) -> Any:
    """Function that sample an event among several with different
    probabilities.

    Args:
        probs (Dict[Any, float]): Dictionary representing the different events
            and their probabilities. Each probability should be above 0 and
            their sum should not exceed 1.
        with_none (bool): If set to `True`, add a `None` option (no event
            sampled).

    Returns:
        The corresponding key of the event sampled.
    """
    options = list(probs.keys())
    weights = list(probs.values())
    assert (
        all(w >= 0 for w in weights) and sum(weights) <= 1
    ), "The numbers given are not a probability (should be above 0 and their sum should not exceed 1)"

    if with_none:
        options.append(None)
        weights.append(1 - sum(weights))

    return random.choices(options, weights=weights)[0]
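
A small usage sketch for sample and sample_among, assuming kebbie is installed; the event names are arbitrary illustrations, not values used by kebbie:

from collections import Counter

from kebbie.utils import sample, sample_among

# sample(0.95) returns True roughly 95% of the time
print(Counter(sample(0.95) for _ in range(10_000)))

# Sample one event (or None, since with_none defaults to True)
event = sample_among({"delete_char": 0.05, "add_char": 0.05, "transpose_char": 0.01})
print(event)  # None most of the time, one of the three keys otherwise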

sample_partial_word(keystrokes, word, true_word)

Sample a partial word from a given word, and extract the corresponding keystrokes as well.

Sampling is done with increasing weights (longer prefixes are more likely to be sampled). For example, if the list represents the keystrokes of "abcdef", the probabilities are as follows:
  • "a" : 1/15
  • "ab" : 2/15
  • "abc" : 3/15
  • "abcd" : 4/15
  • "abcde" : 5/15

Parameters:

Name Type Description Default
keystrokes List[Optional[Tuple[float, float]]]

Complete list of keystrokes, representing a full word.

required
word str

The word corresponding to the keystrokes.

required
true_word str

Actual word (without typo). Necessary to ensure the sampled keystrokes are partial.

required

Returns:

Type Description
List[Optional[Tuple[float, float]]]

The partial list of keystrokes (sampled from the given word).

str

The partial word (sampled from the given word).

Source code in kebbie/utils.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def sample_partial_word(
    keystrokes: List[Optional[Tuple[float, float]]], word: str, true_word: str
) -> Tuple[List[Optional[Tuple[float, float]]], str]:
    """Sample a partial word from a given word, and extract the corresponding
    keystrokes as well.

    Sampling is done with increasing weights (more chances to sample a longer
    list). For example if the list represent the keystrokes of "abcdef", the
    probabilities are as follow:
     * "a" :     1/15
     * "ab" :    2/15
     * "abc" :   3/15
     * "abcd" :  4/15
     * "abcde" : 5/15

    Args:
        keystrokes (List[Optional[Tuple[float, float]]]): Complete list of
            keystrokes, representing a full word.
        word (str): The word corresponding to the keystrokes.
        true_word (str): Actual word (without typo). Necessary to ensure the
            sampled keystrokes are partial.

    Returns:
        The partial list of keystrokes (sampled from the given word).
        The partial word (sampled from the given word).
    """
    r = range(1, min(len(true_word), len(word)))
    s = random.choices(r, weights=r)[0]
    return keystrokes[:s], word[:s]
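
A usage sketch with made-up keystroke coordinates (a None entry is allowed by the type hint and is used here only as a placeholder):

from kebbie.utils import sample_partial_word

keystrokes = [(0.1, 0.2), (0.3, 0.2), None, (0.5, 0.4), (0.7, 0.4), (0.9, 0.4)]
partial_keystrokes, partial_word = sample_partial_word(keystrokes, word="abcdef", true_word="abcdef")

# partial_word is a strict prefix of "abcdef" (e.g. "abcd"), and
# partial_keystrokes contains the matching number of leading keystrokes
print(partial_word, len(partial_keystrokes))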

accuracy(tp, tn, fp, fn)

Function computing the accuracy.

Parameters:

Name Type Description Default
tp int

Number of True Positive.

required
tn int

Number of True Negative.

required
fp int

Number of False Positive.

required
fn int

Number of False Negative.

required

Returns:

Type Description
float

Accuracy.

Source code in kebbie/utils.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def accuracy(tp: int, tn: int, fp: int, fn: int) -> float:
    """Function computing the precision.

    Args:
        tp (int): Number of True Positive.
        tn (int): Number of True Negative.
        fp (int): Number of False Positive.
        fn (int): Number of False Negative.

    Returns:
        Accuracy.
    """
    try:
        return (tp + tn) / (tp + tn + fp + fn)
    except ZeroDivisionError:
        return 0

precision(tp, fp)

Function computing the precision.

Parameters:

Name Type Description Default
tp int

Number of True Positive.

required
fp int

Number of False Positive.

required

Returns:

Type Description
float

Precision.

Source code in kebbie/utils.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def precision(tp: int, fp: int) -> float:
    """Function computing the precision.

    Args:
        tp (int): Number of True Positive.
        fp (int): Number of False Positive.

    Returns:
        Precision.
    """
    try:
        return tp / (tp + fp)
    except ZeroDivisionError:
        return 0

recall(tp, fn)

Function computing the recall.

Parameters:

Name Type Description Default
tp int

Number of True Positive.

required
fn int

Number of False Negative.

required

Returns:

Type Description
float

Recall.

Source code in kebbie/utils.py
201
202
203
204
205
206
207
208
209
210
211
212
213
214
def recall(tp: int, fn: int) -> float:
    """Function computing the recall.

    Args:
        tp (int): Number of True Positive.
        fn (int): Number of False Negative.

    Returns:
        Recall.
    """
    try:
        return tp / (tp + fn)
    except ZeroDivisionError:
        return 0

fbeta(precision, recall, beta=1)

Function computing the F-beta score (which is a generalization of the F1 score).

The value of beta changes how much we weight recall versus precision:
  • For beta=0.5, Precision is twice as important as Recall
  • For beta=2, Recall is twice as important as Precision

Parameters:

Name Type Description Default
precision float

Precision.

required
recall float

Recall.

required
beta float

Beta factor.

1

Returns:

Type Description
float

F-beta score.

Source code in kebbie/utils.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def fbeta(precision: float, recall: float, beta: float = 1) -> float:
    """Function computing the F-beta score (which is a generalization of the
    F1 score).

    The value of Beta changes how much we weight recall versus precision:
     * For beta=0.5, Precision is twice as important as Recall
     * For beta=2, Recall is twice as important as Precision

    Args:
        precision (float): Precision.
        recall (float): Recall.
        beta (float): Beta factor.

    Returns:
        F-beta score.
    """
    try:
        return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
    except ZeroDivisionError:
        return 0
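
A small worked example combining precision, recall and fbeta (assuming kebbie is installed; the counts are made up):

from kebbie.utils import fbeta, precision, recall

tp, fp, fn = 80, 10, 20
p = precision(tp, fp)         # 80 / 90 ≈ 0.889
r = recall(tp, fn)            # 80 / 100 = 0.8
print(fbeta(p, r))            # beta=1: harmonic mean ≈ 0.842
print(fbeta(p, r, beta=0.9))  # ≈ 0.847, weighting precision slightly more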

round_to_n(x, n=2)

Util function to round a given number to n significant digits.

Parameters:

Name Type Description Default
x float

Number to round.

required
n int

Number of significant digits to use.

2

Returns:

Type Description
float

Rounded number.

Source code in kebbie/utils.py
239
240
241
242
243
244
245
246
247
248
249
def round_to_n(x: float, n: int = 2) -> float:
    """Util function to round a given number to n significant digits.

    Args:
        x (float): Number to round.
        n (int): Number of significant digits to use.

    Returns:
        Rounded number.
    """
    return round(x, -int(math.floor(math.log10(x))) + (n - 1)) if x != 0 else 0

human_readable_memory(x)

Given a number in bytes, return a human-readable string of this number, with the right unit.

Parameters:

Name Type Description Default
x int

Number in bytes.

required

Returns:

Type Description
str

Human-readable version of the given number, with the right unit.

Source code in kebbie/utils.py
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def human_readable_memory(x: int) -> str:
    """Given a number in bytes, return a human-readable string of this number,
    with the right unit.

    Args:
        x (int): Number in bytes.

    Returns:
        Human-readable version of the given number, with the right unit.
    """
    x = round_to_n(x, n=3)
    for unit in ["B", "KB", "MB", "GB"]:
        if x < 1000:
            return f"{x:g} {unit}"

        x /= 1000
    return f"{x:g} TB"

human_readable_runtime(x)

Given a number in nanoseconds, return a human-readable string of this number, with the right unit.

Parameters:

Name Type Description Default
x int

Number in nanoseconds.

required

Returns:

Type Description
str

Human-readable version of the given number, with the right unit.

Source code in kebbie/utils.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
def human_readable_runtime(x: int) -> str:
    """Given a number in nanoseconds, return a human-readable string of this
    number, with the right unit.

    Args:
        x (int): Number in nanoseconds.

    Returns:
        Human-readable version of the given number, with the right unit.
    """
    x = round_to_n(x, n=3)
    for unit in ["ns", "μs", "ms"]:
        if x < 1000:
            return f"{x:g} {unit}"

        x /= 1000
    return f"{x:g} s"

get_soda_dataset(max_sentences=2000, seed=31)

Load the SODA dataset.

Parameters:

Name Type Description Default
max_sentences int

Maximum number of sentences in total in the dataset. They will be split across domains (50% from the narrative domain, 50% from the dialogue domain).

2000
seed int

Seed to use when shuffling the dataset (since we don't use the whole dataset, it's better to shuffle it before extracting the first X sentences).

31

Returns:

Type Description
Dict[str, List[str]]

The dataset, separated into two domains: narrative and dialogue.

Source code in kebbie/utils.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def get_soda_dataset(max_sentences: int = 2_000, seed: int = 31) -> Dict[str, List[str]]:
    """Load the SODA dataset.

    Args:
        max_sentences (int, optional): Maximum number of sentences in total in
            the dataset. They will be shared across domain (50% from the
            `narrative` domain, 50% from the `dialogue` domain).
        seed (int, optional): Seed to use when shuffling the dataset (since we
            don't use the whole dataset, it's better to shuffle it before
            extracting the X first sentences).

    Returns:
        The dataset, separated into two domains : narrative and dialogue.
    """
    data = {"narrative": [], "dialogue": []}
    max_domain_sentences = max_sentences // 2

    hf_dataset = datasets.load_dataset("allenai/soda", split="test")
    hf_dataset = hf_dataset.shuffle(seed=seed)

    for sample in hf_dataset:
        if len(data["narrative"]) >= max_domain_sentences and len(data["dialogue"]) >= max_domain_sentences:
            break

        if len(data["narrative"]) < max_domain_sentences:
            data["narrative"].append(sample["narrative"])

        for sen in sample["dialogue"]:
            if len(data["dialogue"]) < max_domain_sentences:
                data["dialogue"].append(sen)

    return data
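
A usage sketch; note that this downloads the allenai/soda dataset through the datasets library, so it needs network access on the first run:

from kebbie.utils import get_soda_dataset

dataset = get_soda_dataset(max_sentences=200)
print(list(dataset.keys()))       # ['narrative', 'dialogue']
print(len(dataset["narrative"]))  # Up to 100 sentences per domain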

Constants

__init__.py

SUPPORTED_LANG = ['en-US']

N_MOST_COMMON_MISTAKES = 1000

DEFAULT_SEED = 42

emulator.py

ANDROID = 'android'

IOS = 'ios'

GBOARD = 'gboard'

TAPPA = 'tappa'

FLEKSY = 'fleksy'

KBKITPRO = 'kbkitpro'

KBKITOSS = 'kbkitoss'

SWIFTKEY = 'swiftkey'

YANDEX = 'yandex'

KEYBOARD_PACKAGE = {GBOARD: 'com.google.android.inputmethod.latin', SWIFTKEY: 'com.touchtype.swiftkey', YANDEX: 'ru.yandex.androidkeyboard', TAPPA: 'com.tappa.keyboard'}

ANDROID_CAPABILITIES = {'platformName': 'android', 'automationName': 'UiAutomator2', 'enableMultiWindows': True, 'deviceName': 'test', 'newCommandTimeout': 3600}

IOS_CAPABILITIES = {'platformName': 'iOS', 'automationName': 'XCUITest', 'udid': 'auto', 'xcodeOrgId': '8556JTA4X4', 'xcodeSigningId': 'iPhone Developer', 'useNewWDA': False, 'usePrebuiltWdDA': True, 'startIWDP': True, 'bundleId': 'com.apple.MobileSMS', 'newCommandTimeout': 3600}

BROWSER_PAD_URL = 'https://www.justnotepad.com'

ANDROID_TYPING_FIELD_CLASS_NAME = 'android.widget.EditText'

DUMMY_RECIPIENT = '0'

IOS_TYPING_FIELD_ID = 'messageBodyField'

IOS_START_CHAT_CLASS_NAME = 'XCUIElementTypeCell'

TESSERACT_CONFIG = '-c tessedit_char_blacklist=0123456789”:!@·$%&/()=.¿?'

PREDICTION_DELAY = 0.4

CONTENT_TO_IGNORE = ['Sticker', 'GIF', 'Clipboard', 'Settings', 'Back', 'Switch input method', 'Paste item', 'Close', 'paintpalette', 'Search Document', 'Microphone', 'gearshape', 'Next Locale', 'paintpalette', 'EmojiCategories/smileysAndPeople', 'EmojiCategories/animalsAndNature', 'EmojiCategories/foodAndDrink', 'EmojiCategories/activity', 'EmojiCategories/travelAndPlaces', 'EmojiCategories/objects', 'EmojiCategories/symbols', 'EmojiCategories/flags', 'Add', 'And', 'Are', '“A”', '🚀', 'Switch language.']

CONTENT_TO_RENAME = {'Shift': 'shift', 'Delete': 'backspace', 'Backspace': 'backspace', 'Space': 'spacebar', 'space': 'spacebar', 'Space.': 'spacebar', 'Emoji button': 'smiley', 'Emoji': 'smiley', 'Keyboard Type - emojis': 'smiley', 'Search': 'enter', 'return': 'enter', 'Enter': 'enter', 'Delete.': 'backspace', 'To symbols.': 'numbers', 'Return.': 'enter', 'Symbol keyboard': 'numbers', 'Symbols': 'numbers', 'Symbols and numbers': 'numbers', 'Keyboard Type - numeric': 'numbers', 'Voice input': 'mic', ',, alternatives available, Voice typing, long press to activate': 'mic', 'Close features menu': 'magic', 'Open features menu': 'magic', 'underline': '_', '&amp;': '&', 'ampersand': '&', 'Dash': '-', 'Plus': '+', 'Left parenthesis': '(', 'Right parenthesis': ')', 'slash': '/', 'Apostrophe': "'", 'Colon': ':', 'Semicolon': ';', 'Exclamation': '!', 'Question mark': '?', 'Letter keyboard': 'letters', 'Letters': 'letters', 'Keyboard Type - auto': 'letters', 'To letters.': 'letters', 'Digit keyboard': 'numbers', 'More symbols': 'shift', 'Keyboard Type - symbolic': 'shift', 'Double tap for uppercase': 'shift', 'Double tap for caps lock': 'shift', 'Uppercase key.': 'shift', 'Additional symbols.': 'shift', 'capital Q': 'Q', 'capital W': 'W', 'capital E': 'E', 'capital R': 'R', 'capital T': 'T', 'capital Y': 'Y', 'capital U': 'U', 'capital I': 'I', 'Capital I': 'I', 'capital O': 'O', 'capital P': 'P', 'capital A': 'A', 'capital S': 'S', 'capital D': 'D', 'capital F': 'F', 'capital G': 'G', 'capital H': 'H', 'capital J': 'J', 'capital K': 'K', 'capital L': 'L', 'capital Z': 'Z', 'capital X': 'X', 'capital C': 'C', 'capital V': 'V', 'capital B': 'B', 'capital N': 'N', 'capital M': 'M'}

FLEKSY_LAYOUT = {'keyboard_frame': [0, 517, 393, 266], 'lowercase': {'q': [0.007407407407407408, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'w': [0.10462962962962963, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'e': [0.20462962962962963, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'r': [0.30462962962962964, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 't': [0.4046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'y': [0.5046296296296297, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'u': [0.6046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'i': [0.7046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'o': [0.8046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'p': [0.9046296296296297, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'a': [0.05740740740740741, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 's': [0.15555555555555556, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'd': [0.25555555555555554, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'f': [0.35462962962962963, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'g': [0.4546296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'h': [0.5546296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'j': [0.6546296296296297, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'k': [0.7546296296296297, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'l': [0.8555555555555555, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'shift': [0.007407407407407408, 0.5994520547945206, 0.1361111111111111, 0.1643835616438356], 'z': [0.15555555555555556, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'x': [0.25555555555555554, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'c': [0.35462962962962963, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'v': [0.4546296296296296, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'b': [0.5546296296296296, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'n': [0.6546296296296297, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'm': [0.7546296296296297, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'backspace': [0.8555555555555555, 0.5994520547945206, 0.1361111111111111, 0.1643835616438356], 'numbers': [0.007407407407407408, 0.8080821917808219, 0.125, 0.1643835616438356], 'smiley': [0.14351851851851852, 0.8080821917808219, 0.10277777777777777, 0.1643835616438356], 'spacebar': [0.25555555555555554, 0.8080821917808219, 0.48703703703703705, 0.1643835616438356], '.': [0.7546296296296297, 0.8080821917808219, 0.1, 0.1643835616438356], 'enter': [0.8648148148148148, 0.8080821917808219, 0.12962962962962962, 0.1643835616438356]}, 'uppercase': {'Q': [0.007407407407407408, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'W': [0.10462962962962963, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'E': [0.20462962962962963, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'R': [0.30462962962962964, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'T': [0.4046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'Y': [0.5046296296296297, 0.19356164383561644, 0.08796296296296297, 
0.1643835616438356], 'U': [0.6046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'I': [0.7046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'O': [0.8046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'P': [0.9046296296296297, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], 'A': [0.05740740740740741, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'S': [0.15555555555555556, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'D': [0.25555555555555554, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'F': [0.35462962962962963, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'G': [0.4546296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'H': [0.5546296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'J': [0.6546296296296297, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'K': [0.7546296296296297, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'L': [0.8555555555555555, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'shift': [0.007407407407407408, 0.5994520547945206, 0.1361111111111111, 0.1643835616438356], 'Z': [0.15555555555555556, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'X': [0.25555555555555554, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'C': [0.35462962962962963, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'V': [0.4546296296296296, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'B': [0.5546296296296296, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'N': [0.6546296296296297, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'M': [0.7546296296296297, 0.5994520547945206, 0.08796296296296297, 0.1643835616438356], 'backspace': [0.8555555555555555, 0.5994520547945206, 0.1361111111111111, 0.1643835616438356], 'numbers': [0.007407407407407408, 0.8080821917808219, 0.125, 0.1643835616438356], 'smiley': [0.14351851851851852, 0.8080821917808219, 0.10277777777777777, 0.1643835616438356], 'spacebar': [0.25555555555555554, 0.8080821917808219, 0.48703703703703705, 0.1643835616438356], '.': [0.7546296296296297, 0.8080821917808219, 0.1, 0.1643835616438356], 'enter': [0.8648148148148148, 0.8080821917808219, 0.12962962962962962, 0.1643835616438356]}, 'numbers': {'1': [0.007407407407407408, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '2': [0.10462962962962963, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '3': [0.20462962962962963, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '4': [0.30462962962962964, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '5': [0.4046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '6': [0.5046296296296297, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '7': [0.6046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '8': [0.7046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '9': [0.8046296296296296, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '0': [0.9046296296296297, 0.19356164383561644, 0.08796296296296297, 0.1643835616438356], '-': [0.007407407407407408, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], '/': [0.10462962962962963, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], ':': [0.20462962962962963, 0.4008219178082192, 
0.08796296296296297, 0.1643835616438356], ';': [0.30462962962962964, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], '(': [0.4046296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], ')': [0.5046296296296297, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], '$': [0.6046296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], '&': [0.7046296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], '@': [0.8046296296296296, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], '"': [0.9046296296296297, 0.4008219178082192, 0.08796296296296297, 0.1643835616438356], 'shift': [0.007407407407407408, 0.5994520547945206, 0.1361111111111111, 0.1643835616438356], ',': [0.3101851851851852, 0.5994520547945206, 0.12, 0.1643835616438356], '?': [0.44044444444444447, 0.5994520547945206, 0.12, 0.1643835616438356], '!': [0.5707037037037037, 0.5994520547945206, 0.12, 0.1643835616438356], "'": [0.705962962962963, 0.5994520547945206, 0.12, 0.1643835616438356], 'backspace': [0.8551851851851852, 0.5994520547945206, 0.1361111111111111, 0.1643835616438356], 'letters': [0.007407407407407408, 0.8080821917808219, 0.125, 0.1643835616438356], 'smiley': [0.14351851851851852, 0.8080821917808219, 0.10277777777777777, 0.1643835616438356], 'spacebar': [0.25555555555555554, 0.8080821917808219, 0.48703703703703705, 0.1643835616438356], '.': [0.7546296296296297, 0.8080821917808219, 0.1, 0.1643835616438356], 'enter': [0.8648148148148148, 0.8080821917808219, 0.12962962962962962, 0.1643835616438356]}}

gesture.py

MAX_RADIUS = 16

MIN_N_POINTS_PER_DIST = 0.1

MAX_N_POINTS_PER_DIST = 0.25

MIN_ACCELERATION = 0.2

MAX_ACCELERATION = 0.5

layout.py

SPACE = 'spacebar'

POINT = '.'

N_ACCENT_PER_LINE = 4

noise_model.py

DEFAULT_TYPO_PROBS = {Typo.TRANSPOSE_CHAR: 0.01, Typo.DELETE_SPELLING_SYMBOL: 0.1, Typo.ADD_SPELLING_SYMBOL: 0, Typo.DELETE_SPACE: 0.01, Typo.ADD_SPACE: 0, Typo.DELETE_PUNCTUATION: 0, Typo.ADD_PUNCTUATION: 0, Typo.DELETE_CHAR: 0.005, Typo.ADD_CHAR: 0.005, Typo.SIMPLIFY_ACCENT: 0.08, Typo.SIMPLIFY_CASE: 0.08, Typo.COMMON_TYPO: 0.05}

SPACE = ' '

DELETIONS = [Typo.DELETE_SPELLING_SYMBOL, Typo.DELETE_SPACE, Typo.DELETE_PUNCTUATION, Typo.DELETE_CHAR]

FRONT_DELETION_MULTIPLIER = 0.36

DEFAULT_SIGMA_RATIO = 3

CACHE_DIR = os.path.expanduser('~/.cache/common_typos/')

TWEET_TYPO_CORPUS_URL = 'https://luululu.com/tweet/typo-corpus-r1.txt'

oracle.py

CHUNK_SIZE = 10

MAX_CHAR_PER_SENTENCE = 256

SWIPE_PROB = 0.01

scorer.py

DEFAULT_BETA = 0.9

WITH_TYPO = 'with_typo'

WITHOUT_TYPO = 'without_typo'

utils.py

SEC_TO_NANOSEC = 1000000000.0