aboutsummaryrefslogtreecommitdiffstats
path: root/regexp/parse-unidata.awk
diff options
context:
space:
mode:
Diffstat (limited to 'regexp/parse-unidata.awk')
-rw-r--r--regexp/parse-unidata.awk41
1 files changed, 41 insertions, 0 deletions
diff --git a/regexp/parse-unidata.awk b/regexp/parse-unidata.awk
new file mode 100644
index 000000000..c293c7ecf
--- /dev/null
+++ b/regexp/parse-unidata.awk
@@ -0,0 +1,41 @@
+# Parse the unicode data from:
+# https://unicode.org/Public/UNIDATA/UnicodeData.txt
+# to generate case mapping table
+
+BEGIN {
+ print("/* Generated from UnicodeData.txt */")
+ print("")
+ print("static const struct casemap unicode_case_mapping_upper[] = {")
+ FS = ";"
+ count = 0
+}
+
+{
+ code = strtonum(("0x" $1))
+ name = $2
+ class = $3
+ upper = $13
+ lower = $14
+ title = $15
+
+ if (code <= 0x7f) {
+ next
+ }
+ if (code > 0xffff) {
+ next
+ }
+ if ($3 !~ /^L.*/) {
+ next
+ }
+ if (upper != "") {
+ printf("\t{ 0x" tolower($1) ", 0x" tolower(upper) " },")
+ count++
+ if ((count % 4) == 0) {
+ print("")
+ }
+ }
+}
+
+END {
+ print("\n};")
+}