Skip to content

Commit

Permalink
[Core] Add getreuer's Autocorrect feature to core (qmk#15699)
Browse files Browse the repository at this point in the history
Co-authored-by: Albert Y <[email protected]>
  • Loading branch information
drashna and filterpaper authored Sep 17, 2022
1 parent d67d388 commit fb29c0a
Show file tree
Hide file tree
Showing 17 changed files with 1,213 additions and 2 deletions.
1 change: 1 addition & 0 deletions builddefs/generic_features.mk
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ SPACE_CADET_ENABLE ?= yes
GRAVE_ESC_ENABLE ?= yes

GENERIC_FEATURES = \
AUTOCORRECT \
CAPS_WORD \
COMBO \
COMMAND \
Expand Down
3 changes: 2 additions & 1 deletion builddefs/show_options.mk
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ OTHER_OPTION_NAMES = \
LTO_ENABLE \
PROGRAMMABLE_BUTTON_ENABLE \
SECURE_ENABLE \
CAPS_WORD_ENABLE
CAPS_WORD_ENABLE \
AUTOCORRECT_ENABLE

define NAME_ECHO
@printf " %-30s = %-16s # %s\\n" "$1" "$($1)" "$(origin $1)"
Expand Down
1 change: 1 addition & 0 deletions docs/_summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@

* Software Features
* [Auto Shift](feature_auto_shift.md)
* [Autocorrect](feature_autocorrect.md)
* [Caps Word](feature_caps_word.md)
* [Combos](feature_combo.md)
* [Debounce API](feature_debounce_type.md)
Expand Down
295 changes: 295 additions & 0 deletions docs/feature_autocorrect.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions lib/python/qmk/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
'qmk.cli.format.python',
'qmk.cli.format.text',
'qmk.cli.generate.api',
'qmk.cli.generate.autocorrect_data',
'qmk.cli.generate.compilation_database',
'qmk.cli.generate.config_h',
'qmk.cli.generate.develop_pr_list',
Expand Down
289 changes: 289 additions & 0 deletions lib/python/qmk/cli/generate/autocorrect_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Python program to make autocorrect_data.h.
This program reads from a prepared dictionary file and generates a C source file
"autocorrect_data.h" with a serialized trie embedded as an array. Run this
program and pass it as the first argument like:
$ qmk generate-autocorrect-data autocorrect_dict.txt
Each line of the dict file defines one typo and its correction with the syntax
"typo -> correction". Blank lines or lines starting with '#' are ignored.
Example:
:thier -> their
fitler -> filter
lenght -> length
ouput -> output
widht -> width
For full documentation, see QMK Docs
"""

import sys
import textwrap
from typing import Any, Dict, Iterator, List, Tuple

from milc import cli

import qmk.path
from qmk.keyboard import keyboard_completer, keyboard_folder
from qmk.keymap import keymap_completer, locate_keymap

KC_A = 4
KC_SPC = 0x2c
KC_QUOT = 0x34

TYPO_CHARS = dict([
("'", KC_QUOT),
(':', KC_SPC), # "Word break" character.
] + [(chr(c), c + KC_A - ord('a')) for c in range(ord('a'),
ord('z') + 1)]) # Characters a-z.


def parse_file(file_name: str) -> List[Tuple[str, str]]:
"""Parses autocorrections dictionary file.
Each line of the file defines one typo and its correction with the syntax
"typo -> correction". Blank lines or lines starting with '#' are ignored. The
function validates that typos only have characters a-z and that typos are not
substrings of other typos, otherwise the longer typo would never trigger.
Args:
file_name: String, path of the autocorrections dictionary.
Returns:
List of (typo, correction) tuples.
"""

try:
from english_words import english_words_lower_alpha_set as correct_words
except ImportError:
cli.echo('Autocorrection will falsely trigger when a typo is a substring of a correctly spelled word.')
cli.echo('To check for this, install the english_words package and rerun this script:')
cli.echo(' {fg_cyan}python3 -m pip install english_words')
# Use a minimal word list as a fallback.
correct_words = ('information', 'available', 'international', 'language', 'loosest', 'reference', 'wealthier', 'entertainment', 'association', 'provides', 'technology', 'statehood')

autocorrections = []
typos = set()
for line_number, typo, correction in parse_file_lines(file_name):
if typo in typos:
cli.log.warning('{fg_red}Error:%d:{fg_reset} Ignoring duplicate typo: "{fg_cyan}%s{fg_reset}"', line_number, typo)
continue

# Check that `typo` is valid.
if not (all([c in TYPO_CHARS for c in typo])):
cli.log.error('{fg_red}Error:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" has characters other than a-z, \' and :.', line_number, typo)
sys.exit(1)
for other_typo in typos:
if typo in other_typo or other_typo in typo:
cli.log.error('{fg_red}Error:%d:{fg_reset} Typos may not be substrings of one another, otherwise the longer typo would never trigger: "{fg_cyan}%s{fg_reset}" vs. "{fg_cyan}%s{fg_reset}".', line_number, typo, other_typo)
sys.exit(1)
if len(typo) < 5:
cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} It is suggested that typos are at least 5 characters long to avoid false triggers: "{fg_cyan}%s{fg_reset}"', line_number, typo)
if len(typo) > 127:
cli.log.error('{fg_red}Error:%d:{fg_reset} Typo exceeds 127 chars: "{fg_cyan}%s{fg_reset}"', line_number, typo)
sys.exit(1)

check_typo_against_dictionary(typo, line_number, correct_words)

autocorrections.append((typo, correction))
typos.add(typo)

return autocorrections


def make_trie(autocorrections: List[Tuple[str, str]]) -> Dict[str, Any]:
"""Makes a trie from the the typos, writing in reverse.
Args:
autocorrections: List of (typo, correction) tuples.
Returns:
Dict of dict, representing the trie.
"""
trie = {}
for typo, correction in autocorrections:
node = trie
for letter in typo[::-1]:
node = node.setdefault(letter, {})
node['LEAF'] = (typo, correction)

return trie


def parse_file_lines(file_name: str) -> Iterator[Tuple[int, str, str]]:
"""Parses lines read from `file_name` into typo-correction pairs."""

line_number = 0
for line in open(file_name, 'rt'):
line_number += 1
line = line.strip()
if line and line[0] != '#':
# Parse syntax "typo -> correction", using strip to ignore indenting.
tokens = [token.strip() for token in line.split('->', 1)]
if len(tokens) != 2 or not tokens[0]:
print(f'Error:{line_number}: Invalid syntax: "{line}"')
sys.exit(1)

typo, correction = tokens
typo = typo.lower() # Force typos to lowercase.
typo = typo.replace(' ', ':')

yield line_number, typo, correction


def check_typo_against_dictionary(typo: str, line_number: int, correct_words) -> None:
"""Checks `typo` against English dictionary words."""

if typo.startswith(':') and typo.endswith(':'):
if typo[1:-1] in correct_words:
cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" is a correctly spelled dictionary word.', line_number, typo)
elif typo.startswith(':') and not typo.endswith(':'):
for word in correct_words:
if word.startswith(typo[1:]):
cli.log.warning('{fg_yellow}Warning:%d: {fg_reset}Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)
elif not typo.startswith(':') and typo.endswith(':'):
for word in correct_words:
if word.endswith(typo[:-1]):
cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)
elif not typo.startswith(':') and not typo.endswith(':'):
for word in correct_words:
if typo in word:
cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)


def serialize_trie(autocorrections: List[Tuple[str, str]], trie: Dict[str, Any]) -> List[int]:
"""Serializes trie and correction data in a form readable by the C code.
Args:
autocorrections: List of (typo, correction) tuples.
trie: Dict of dicts.
Returns:
List of ints in the range 0-255.
"""
table = []

# Traverse trie in depth first order.
def traverse(trie_node):
if 'LEAF' in trie_node: # Handle a leaf trie node.
typo, correction = trie_node['LEAF']
word_boundary_ending = typo[-1] == ':'
typo = typo.strip(':')
i = 0 # Make the autocorrection data for this entry and serialize it.
while i < min(len(typo), len(correction)) and typo[i] == correction[i]:
i += 1
backspaces = len(typo) - i - 1 + word_boundary_ending
assert 0 <= backspaces <= 63
correction = correction[i:]
bs_count = [backspaces + 128]
data = bs_count + list(bytes(correction, 'ascii')) + [0]

entry = {'data': data, 'links': [], 'byte_offset': 0}
table.append(entry)
elif len(trie_node) == 1: # Handle trie node with a single child.
c, trie_node = next(iter(trie_node.items()))
entry = {'chars': c, 'byte_offset': 0}

# It's common for a trie to have long chains of single-child nodes. We
# find the whole chain so that we can serialize it more efficiently.
while len(trie_node) == 1 and 'LEAF' not in trie_node:
c, trie_node = next(iter(trie_node.items()))
entry['chars'] += c

table.append(entry)
entry['links'] = [traverse(trie_node)]
else: # Handle trie node with multiple children.
entry = {'chars': ''.join(sorted(trie_node.keys())), 'byte_offset': 0}
table.append(entry)
entry['links'] = [traverse(trie_node[c]) for c in entry['chars']]
return entry

traverse(trie)

def serialize(e: Dict[str, Any]) -> List[int]:
if not e['links']: # Handle a leaf table entry.
return e['data']
elif len(e['links']) == 1: # Handle a chain table entry.
return [TYPO_CHARS[c] for c in e['chars']] + [0] # + encode_link(e['links'][0]))
else: # Handle a branch table entry.
data = []
for c, link in zip(e['chars'], e['links']):
data += [TYPO_CHARS[c] | (0 if data else 64)] + encode_link(link)
return data + [0]

byte_offset = 0
for e in table: # To encode links, first compute byte offset of each entry.
e['byte_offset'] = byte_offset
byte_offset += len(serialize(e))
assert 0 <= byte_offset <= 0xffff

return [b for e in table for b in serialize(e)] # Serialize final table.


def encode_link(link: Dict[str, Any]) -> List[int]:
"""Encodes a node link as two bytes."""
byte_offset = link['byte_offset']
if not (0 <= byte_offset <= 0xffff):
cli.log.error('{fg_red}Error:{fg_reset} The autocorrection table is too large, a node link exceeds 64KB limit. Try reducing the autocorrection dict to fewer entries.')
sys.exit(1)
return [byte_offset & 255, byte_offset >> 8]


def write_generated_code(autocorrections: List[Tuple[str, str]], data: List[int], file_name: str) -> None:
"""Writes autocorrection data as generated C code to `file_name`.
Args:
autocorrections: List of (typo, correction) tuples.
data: List of ints in 0-255, the serialized trie.
file_name: String, path of the output C file.
"""
assert all(0 <= b <= 255 for b in data)

def typo_len(e: Tuple[str, str]) -> int:
return len(e[0])

min_typo = min(autocorrections, key=typo_len)[0]
max_typo = max(autocorrections, key=typo_len)[0]
generated_code = ''.join([
'// Generated code.\n\n', f'// Autocorrection dictionary ({len(autocorrections)} entries):\n', ''.join(sorted(f'// {typo:<{len(max_typo)}} -> {correction}\n' for typo, correction in autocorrections)),
f'\n#define AUTOCORRECT_MIN_LENGTH {len(min_typo)} // "{min_typo}"\n', f'#define AUTOCORRECT_MAX_LENGTH {len(max_typo)} // "{max_typo}"\n\n', f'#define DICTIONARY_SIZE {len(data)}\n\n',
textwrap.fill('static const uint8_t autocorrect_data[DICTIONARY_SIZE] PROGMEM = {%s};' % (', '.join(map(str, data))), width=120, subsequent_indent=' '), '\n\n'
])

with open(file_name, 'wt') as f:
f.write(generated_code)


@cli.argument('filename', default='autocorrect_dict.txt', help='The autocorrection database file')
@cli.argument('-kb', '--keyboard', type=keyboard_folder, completer=keyboard_completer, help='The keyboard to build a firmware for. Ignored when a configurator export is supplied.')
@cli.argument('-km', '--keymap', completer=keymap_completer, help='The keymap to build a firmware for. Ignored when a configurator export is supplied.')
@cli.argument('-o', '--output', arg_only=True, type=qmk.path.normpath, help='File to write to')
@cli.subcommand('Generate the autocorrection data file from a dictionary file.')
def generate_autocorrect_data(cli):
autocorrections = parse_file(cli.args.filename)
trie = make_trie(autocorrections)
data = serialize_trie(autocorrections, trie)
# Environment processing
if cli.args.output == '-':
cli.args.output = None

if cli.args.output:
cli.args.output.parent.mkdir(parents=True, exist_ok=True)
cli.log.info('Creating autocorrect database at {fg_cyan}%s', cli.args.output)
write_generated_code(autocorrections, data, cli.args.output)

else:
current_keyboard = cli.args.keyboard or cli.config.user.keyboard or cli.config.generate_autocorrect_data.keyboard
current_keymap = cli.args.keymap or cli.config.user.keymap or cli.config.generate_autocorrect_data.keymap

if current_keyboard and current_keymap:
filename = locate_keymap(current_keyboard, current_keymap).parent / 'autocorrect_data.h'
cli.log.info('Creating autocorrect database at {fg_cyan}%s', filename)
write_generated_code(autocorrections, data, filename)

else:
write_generated_code(autocorrections, data, 'autocorrect_data.h')

cli.log.info('Processed %d autocorrection entries to table with %d bytes.', len(autocorrections), len(data))
2 changes: 1 addition & 1 deletion quantum/eeconfig.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ void eeconfig_init_quantum(void) {
eeprom_update_byte(EECONFIG_DEFAULT_LAYER, 0);
default_layer_state = 0;
eeprom_update_byte(EECONFIG_KEYMAP_LOWER_BYTE, 0);
eeprom_update_byte(EECONFIG_KEYMAP_UPPER_BYTE, 0x4);
eeprom_update_byte(EECONFIG_KEYMAP_UPPER_BYTE, 0xC);
eeprom_update_byte(EECONFIG_MOUSEKEY_ACCEL, 0);
eeprom_update_byte(EECONFIG_BACKLIGHT, 0);
eeprom_update_byte(EECONFIG_AUDIO, 0xFF); // On by default
Expand Down
1 change: 1 addition & 0 deletions quantum/keycode_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ typedef union {
bool swap_rctl_rgui : 1;
bool oneshot_enable : 1;
bool swap_escape_capslock : 1;
bool autocorrect_enable : 1;
};
} keymap_config_t;

Expand Down
Loading

0 comments on commit fb29c0a

Please sign in to comment.