nheko/scripts/emoji_codegen.py

138 lines
5.0 KiB
Python
Raw Normal View History

2017-04-23 20:31:08 +02:00
#!/usr/bin/env python3
import sys
2020-01-24 04:18:14 +01:00
import re
from unidecode import unidecode
2017-04-23 20:31:08 +02:00
from jinja2 import Template
class Emoji(object):
    """One emoji entry destined for the generated C++ table.

    Attributes:
        code: the code point sequence rendered as concatenated C++ universal
            character escapes, e.g. ``\\U0001F468\\U0000200D\\U0001F469``.
        shortname: the shortcode stored for the emoji.
        unicodename: the emoji name passed in by the caller.
    """

    def __init__(self, code, shortname, unicodename):
        """Build an entry from whitespace-separated hexadecimal code points.

        Args:
            code: hexadecimal code points separated by whitespace, such as
                ``"1F468 200D 1F469"``.
            shortname: shortcode to store unchanged.
            unicodename: descriptive name to store unchanged.
        """
        # split() without an argument tolerates runs of spaces and tabs;
        # splitting on a single ' ' would yield empty tokens that turn into a
        # bogus \U00000000 escape.
        self.code = ''.join('\\U' + point.zfill(8) for point in code.split())
        self.shortname = shortname
        self.unicodename = unicodename
2017-04-23 20:31:08 +02:00
def generate_qml_list(**kwargs):
    """Print the C++ definition of ``emoji::Provider::emoji`` to stdout.

    Every keyword argument describes one category: the keyword is the
    category name (capitalised in the output) and the value is a sequence of
    objects exposing ``code``, ``shortname`` and ``unicodename``. The array
    size in the generated declaration is the total number of entries across
    all categories.
    """
    template = Template('''
constexpr std::array<Emoji, {{ entrycount }} > emoji::Provider::emoji = {
{%- for c in kwargs.items() %}
// {{ c[0].capitalize() }}
{%- for e in c[1] %}
Emoji{null_literal(u"{{ e.code }}"), null_literal(u"{{ e.shortname }}"), null_literal(u"{{ e.unicodename }}"), emoji::Emoji::Category::{{ c[0].capitalize() }}},
{%- endfor %}
{%- endfor %}
};
''')
    # Total number of emoji over all categories; becomes the std::array size.
    total = 0
    for members in kwargs.values():
        total += len(members)
    rendered = template.render(kwargs=kwargs, entrycount=total)
    print(rendered)
2017-04-23 20:31:08 +02:00
# Suffix rewrites applied to the lower-cased emoji name when no curated
# shortcode exists: the suffix is removed and the prefix (possibly empty) is
# prepended. Rules run in order and more than one may apply to a name.
_SUFFIX_RULES = (
    (' (blood type)', ''),
    (': red hair', 'red_haired_'),
    (': curly hair', 'curly_haired_'),
    (': white hair', 'white_haired_'),
    (': bald', 'bald_'),
    (': beard', 'bearded_'),
    (' face', ''),
    (' button', ''),
    (' banknote', ''),
)
_FLAG_PREFIX = 'flag: '


def _derive_shortname(name):
    """Derive a shortcode from an emoji name that has no curated shortcode.

    The name is lower-cased, rewritten by ``_SUFFIX_RULES``, "flag: X" is
    turned into "X flag", and the result is reduced to word characters joined
    by single underscores (no leading or trailing underscore) before being
    transliterated with ``unidecode``.
    """
    shortname = name.lower()
    for suffix, prefix in _SUFFIX_RULES:
        if shortname.endswith(suffix):
            shortname = prefix + shortname[:-len(suffix)]

    # FIXME: Is there a better way to do this?
    if shortname.startswith(_FLAG_PREFIX):
        shortname = shortname[len(_FLAG_PREFIX):] + ' flag'
    shortname = shortname.replace('u.s.', 'us')
    shortname = shortname.replace('&', 'and')

    shortname = shortname.replace('-', '_')
    shortname = re.sub(r'\W', '_', shortname)
    # Strip underscores on both ends, then collapse inner runs to one.
    shortname = re.sub(r'_{2,}', '_', shortname.strip('_'))
    return unidecode(shortname)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('usage: emoji_codegen.py /path/to/emoji-test.txt /path/to/shortcodes.txt')
        sys.exit(1)

    filename = sys.argv[1]
    shortcodefilename = sys.argv[2]

    people = []
    nature = []
    food = []
    activity = []
    travel = []
    objects = []
    symbols = []
    flags = []

    # Maps the "# group:" headings of the emoji data file to output lists;
    # several groups share one list.
    categories = {
        'Smileys & Emotion': people,
        'People & Body': people,
        'Animals & Nature': nature,
        'Food & Drink': food,
        'Travel & Places': travel,
        'Activities': activity,
        'Objects': objects,
        'Symbols': symbols,
        'Flags': flags,
        'Component': symbols,
    }

    # Curated shortcodes, one "<key>:<shortcode>" pair per line. The key is
    # looked up with the code point column of the emoji data file below.
    shortcodeDict = {}
    with open(shortcodefilename, 'r', encoding="utf8") as shortcode_file:
        for line in shortcode_file:
            line = line.strip()
            if not line:
                # A blank line would otherwise break the unpacking below.
                continue
            key, shortcode = line.split(':')
            shortcodeDict[key] = shortcode

    name_pattern = re.compile(r'^(\S+) E\d+\.\d+ (.*)$')
    current_category = ''
    with open(filename, 'r', encoding="utf8") as emoji_file:
        for line in emoji_file:
            if line.startswith('# group:'):
                current_category = line.split(':', 1)[1].strip()
            if not line or line.startswith('#'):
                continue

            # Data lines split into: code points, qualification, "char Ex.y name".
            # Anything else (including blank lines) is ignored.
            segments = re.split(r'\s+[#;] ', line.strip())
            if len(segments) != 3:
                continue

            code, qualification, charAndName = segments

            # skip unqualified versions of same unicode
            if qualification != 'fully-qualified':
                continue

            name = name_pattern.match(charAndName).group(2)

            # TODO: Handle skintone modifiers in a sane way. Until then the
            # skin tone variants are kept in the output.

            # NOTE(review): curated shortcodes are used verbatim and only
            # derived names are normalised - confirm this matches upstream.
            if code in shortcodeDict:
                shortname = shortcodeDict[code]
            else:
                shortname = _derive_shortname(name)

            categories[current_category].append(Emoji(code, shortname, name))

    # Use xclip to pipe the output to clipboard.
    # e.g ./emoji_codegen.py emoji-test.txt shortcodes.txt | xclip -sel clip
    # alternatively - delete the var from src/emoji/Provider.cpp, and do ./codegen.sh emojis shortcodes >> ../src/emoji/Provider.cpp
    generate_qml_list(people=people, nature=nature, food=food, activity=activity, travel=travel, objects=objects, symbols=symbols, flags=flags)