1 #!/usr/bin/env ruby
     2 require 'json'
     3 require 'set'
     4 
     5 # JSON file needs Emojibase-formatted entries, see https://emojibase.dev
     6 # Also see customizer.rb in this directory.
     7 
     8 json_in = ARGF.read
     9 
    10 my_emoji = JSON.parse(json_in)
    11 
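# Output target format example (taken from original dev test data).
# NOTE: this shows the early layout. The script below emits `groups`, a
# `words` string, and an `emoji` list of [glyph, label, tags] triples; it
# does not emit a `tags` map.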
    13 #
    14 #     FC.data = {
    15 #         groups: [
    16 #             { title: "People"  , emoji: "ð", range: [0,2] },
    17 #             { title: "Natural" , emoji: "ðī", range: [4,5] },
    18 #             { title: "Activity", emoji: "ð§", range: [6,6] },
    19 #             { title: "Things"  , emoji: "ðŧïļ", range: [0,0] },
    20 #         ],
    21 #         tags: {
    22 #             "face": [0,1,2,4],
    23 #             "wacky": [1],
    24 #             "cool": [2],
    25 #             "bear": [3,6],
    26 #             "pig": [4],
    27 #             "owl": [5],
    28 #             "animal": [3,4,5],
    29 #             "teddy": [6],
    30 #         },
    31 #         emoji: [
#             "😀", // 0 grinning face
#             "🤪", // 1 wacky face
#             "😎", // 2 cool face with sunglasses
#             "🐻", // 3 bear
#             "🐷", // 4 pig face
#             "🦉", // 5 owl
#             "🧸", // 6 teddy bear
    39 #         ],
    40 #     };
    41 
    42 # Begin!
    43 puts "FC.data = {"
    44 
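# Make group list. For screen real estate, I've combined some of the official
# groups together (listed in each entry's from_groups). These combined groups
# become selectable "tab" filters in the final interface. Official groups that
# appear in no from_groups list (2 Components, 9 Flags) get no tab range.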
    48 my_groups = [
    49   # Official group names:
    50   #     0  Smileys & Emotion
    51   #     1  People & Body
    52   #     2  Components
    53   #     3  Animals & Nature
    54   #     4  Food & Drink
    55   #     5  Travel & Places
    56   #     6  Activities
    57   #     7  Objects
    58   #     8  Symbols
    59   #     9  Flags
    60   { title: "People"  , emoji: "ð", from_groups: [0,1], range: [nil,0] },
    61   { title: "Natural" , emoji: "ðī", from_groups: [3,4], range: [nil,0] },
    62   { title: "Activity", emoji: "ð§", from_groups: [5,6], range: [nil,0] },
    63   { title: "Things"  , emoji: "ðŧïļ", from_groups: [7,8], range: [nil,0] },
    64 ]
    65 
    66 # Find first and last (range) emoji for each group
    67 my_emoji.each_with_index do |e, i|
    68   #puts "#{i} #{e["group"]}"
    69   # is this group one of the from_groups?
    70   g = my_groups.find { |g| g[:from_groups].include?(e["group"]) }
    71   if g
    72    # puts "#{i} vs #{g[:range][0]} - #{g[:range][1]}"
    73     if g[:range][0] === nil
    74       g[:range][0] = i # first!
    75     end
    76     if i > g[:range][1]
    77       g[:range][1] = i # maybe last
    78     end
    79   end
    80 end
    81 
    82 # Print groups (not just turning the whole thing over
    83 # to JSON.generate because I want to have explicit
    84 # control over the pretty-printing as a compactness vs.
    85 # readability balance. Since this output is really JS,
    86 # not strict JSON, I can have trailing commas and all
    87 # that good stuff, which simplifies things quite a bit.
    88 group_strs = []
    89 my_groups.each do |g|
    90   g.delete(:from_groups)
    91   group_strs.push "  #{JSON.generate(g)}"
    92 end
    93 puts "groups: [\n#{group_strs.join(",\n")}\n],"
    94 
    95 # Remove tags that are in an entry's label
    96 my_emoji.each do |e|
    97   new_tags = e["tags"].filter do |t|
    98     !e["label"].include?(t)
    99   end
   100   e["tags"] = new_tags
   101 end
   102 
   103 # Get all "word" usage counts from all tags and labels
   104 word_usage = {}
   105 my_emoji.each do |e|
   106   these_words = e["tags"].to_set
   107   these_words.merge e["label"].split(' ')
   108 
   109   these_words.each do |word|
   110     if word_usage.key?(word)
   111       word_usage[word] += 1
   112     else
   113       word_usage[word] = 1
   114     end
   115   end
   116 end
   117 
   118 
   119 # Word parameters to adjust for best results
   120 # Both of these will work with 1 or higher
   121 min_word_usage_count = 4
   122 min_word_length = 4
   123 
   124 if !ENV['MIN_WORD_USAGE_COUNT'].nil?
   125   min_word_usage_count = ENV['MIN_WORD_USAGE_COUNT'].to_i
   126 end
   127 
   128 if !ENV['MIN_WORD_LENGTH'].nil?
   129   min_word_length = ENV['MIN_WORD_LENGTH'].to_i
   130 end
   131 
   132 # TODO allow input of the above in ARGV to override the defaults so I can
   133 # automate the 25 or so permutations and get the output bytes for comparison to
   134 # see which one is smallest. THEN change the defaults to match!
   135     
   136 # Convert word list to array and enforce parameters
   137 word_usage_list = []
   138 word_usage.each do |word,count|
   139   if count >= min_word_usage_count &&
   140       word.length >= min_word_length
   141     word_usage_list.push [word,count]
   142   end
   143 end
   144 
   145 # Sort by usage count so more frequent words have lower index numbers
   146 # (literally just for the savings of a shorter number of digits)
   147 word_usage_list.sort_by! { |w| w[1] }.reverse!
   148 
   149 # Turn usage list into array (just the word (0th position))
   150 my_words = word_usage_list.map { |w| w[0] }
   151 
   152 line_len = 0
   153 first = true
   154 print "words: '"
   155 my_words.each_with_index do |w|
   156   if line_len + w.length > 70
   157     # don't add to a long line, start a new one
   158     print "'\n+'"
   159     line_len = 0
   160   end
   161   if first
   162     first = false
   163   else
   164     print ' '
   165   end
   166   print w
   167   line_len += w.length
   168 end
   169 puts "',"
   170 
   171 # Replace any words from list in labels with, e.g. $15, $256
   172 #
   173 # Surprisingly, there are NO tags or labels with '$' in them
   174 # (see check_for_dollar_tags.rb)
   175 my_labels = []
   176 my_emoji.each do |e|
   177   label_strings = []
   178   e['label'].split(' ').each do |word|
   179     idx = my_words.find_index(word)
   180     if idx.nil?
   181       # not in word list, push verbatim word
   182       label_strings.push(word)
   183     else
   184       label_strings.push("$#{idx}")
   185     end
   186   end
   187   my_labels.push label_strings.join(' ')
   188 end
   189 
   190 # Make tag reference list:
   191 #   - exclude if tag can be found in the label
   192 #   - in the word list: use number
   193 #   - not in word list: use verbatim string
   194 my_tags = []
   195 my_emoji.each do |e|
   196   these_tags = []
   197   e["tags"].each do |t|
   198     if e["label"].include?(t)
   199       puts "already in label: #{t}"
   200       next # it's already a word in the label
   201     end
   202     idx = my_words.find_index(t)
   203     if idx.nil?
   204       # not in word list, push verbatim tag
   205       these_tags.push(t)
   206     else
   207       these_tags.push("$#{idx}")
   208     end
   209   end
   210   my_tags.push these_tags.join(' ')
   211 end
   212 
   213 # Print emoji
   214 # Collate in the labels and tags.
   215 # As an array of arrays in this index order
   216 #   0: emoji glyph
   217 #   1: label string
   218 #   2: tag string
   219 # Example: ['X','winking $0',[2,'fart',17]]
   220 #
   221 line_len = 0
   222 puts "emoji: ["
   223 my_emoji.each_with_index do |e, i|
   224   str = "['#{e['emoji']}','#{my_labels[i]}','#{my_tags[i]}'],"
   225   if line_len + str.length > 80
   226     # don't add to a long line, start a new one
   227     puts
   228     line_len = 0
   229   end
   230   print str
   231   line_len += str.length
   232 end
   233 
   234 puts
   235 puts "]  // End of FC.data.emoji"
   236 puts "}; // End of FC.data"