colorful rat Ratfactor.com > Dave's Repos

hoot

Silly HTML game building engine
git clone http://ratfactor.com/repos/hoot/hoot.git

hoot/js/hoot-create-dictionary.html

Download raw file: js/hoot-create-dictionary.html

1 <html> 2 <body> 3 <script> 4 5 /* 6 7 This is a utility which was created and used to make the compression 8 dictionary which is used by the Hoot "build" output script compression 9 and decompression. 10 11 This is not used by Hoot in any way, but shall remain for posterity 12 because it's pretty cool. 13 14 It finds strings which repeat themselves in a corpus. Not WORDS, mind you, 15 STRINGS! :-) 16 17 Run this to see the neat output. You can also uncomment the other 18 document.writes in order to see the inner workings of the script. 19 20 Warning: it will absolutely blow up your browser if you give it input 21 which is too large. Start with something small and work your way up to 22 see what works for you. 23 24 Also, the smaller the max_likely_string_len value, the quicker it will 25 run, but it won't be able to catch any repeated strings longer than that 26 value, so fiddle with it all you like. There's no sense in making 27 max_likely_string_len any bigger than half of your total input string 28 because a string longer than half cannot possibly be repeated! 29 30 */ 31 32 33 34 // some Lord Tennyson to add some culture to this source: 35 var corpus = "Half a league, half a league, Half a league onward, All in the valley of Death Rode the six hundred. Forward the Light Brigade! Charge for the guns! he said. Into the valley of Death Rode the six hundred. Forward, the Light Brigade! Was there a man dismay'd? Not tho' the soldier knew Some one had blunder'd. Theirs not to make reply, Theirs not to reason why, Theirs but to do and die. Into the valley of Death Rode the six hundred. Cannon to right of them, Cannon to left of them, Cannon in front of them Volley'd and thunder'd; Storm'd at with shot and shell, Boldly they rode and well, Into the jaws of Death, Into the mouth of hell Rode the six hundred. Flash'd all their sabres bare, Flash'd as they turn'd in air Sabring the gunners there, Charging an army, while All the world wonder'd. Plunged in the battery-smoke Right thro' the line they broke; Cossack and Russian Reel'd from the sabre-stroke Shatter'd and sunder'd. Then they rode back, but not, Not the six hundred. Cannon to right of them, Cannon to left of them, Cannon behind them Volley'd and thunder'd; Storm'd at with shot and shell, While horse and hero fell, They that had fought so well Came thro' the jaws of Death, Back from the mouth of hell, All that was left of them, Left of six hundred. When can their glory fade? O the wild charge they made! All the world wonder'd. Honor the charge they made! Honor the Light Brigade, Noble six hundred!"; 36 37 // Settings: 38 var max_likely_string_len = 100; 39 var min_string_len = 3; 40 var min_frequency = 3; 41 42 var tree = []; // suffix tree 43 44 45 // make a suffix tree! 46 // ============================================================================= 47 48 for(var start_letter=0; start_letter<corpus.length; start_letter++){ 49 branch = tree; 50 //document.write(corpus.slice(start_letter,start_letter+max_likely_string_len)+"&nbsp; &nbsp; &nbsp; &nbsp; "); 51 for(var i=0; i<max_likely_string_len && start_letter+i < corpus.length; i++){ 52 c = corpus[start_letter+i]; 53 if(branch[c]){ 54 branch[c].count++; 55 } 56 else{ 57 branch[c] = {count:1, mybranch:{}}; 58 } 59 //document.write(c+"<sup>"+branch[c].count+"</sup>"); 60 branch = branch[c].mybranch; 61 } 62 //document.write("<br>"); 63 } 64 65 66 67 // grab the strings! 68 // ============================================================================= 69 70 var strings = {}; 71 72 function getStrings(tree, prevstr, freq){ 73 74 for(c in tree){ 75 str = prevstr 76 if(tree[c].count >= min_frequency){ 77 str+=c; 78 //document.write("<span style='background-color:#DDD'>"+prevstr+"<sup>"+freq+"</sup> "+str+"<sup>"+tree[c].count+"</sup></span> "); 79 getStrings(tree[c].mybranch, str, tree[c].count); 80 } 81 } 82 83 if(str.length>=min_string_len && strings[str] === undefined){ 84 strings[str]=freq; 85 //document.write("<span style='background-color:#BF7'>Done creating: "+str+"<sup>"+freq+"</sup></span><br>"); 86 } 87 88 // this if is strictly for document.write display of inner workings 89 //if(str.length<min_string_len){ 90 // document.write("<span style='background-color:#F99'>Too short, discarding: "+str+"</span><br>"); 91 //} 92 } 93 getStrings(tree, "", 0); 94 95 96 97 98 // discard smaller strings that are inside of bigger strings 99 // ============================================================================= 100 for(s1 in strings){ 101 for(s2 in strings){ 102 if( 103 s2 != s1 // they are NOT the same string 104 && strings[s2] == strings[s1] // they ARE the same freq 105 && s2.length > s1.length // s2 is larger 106 && s2.indexOf(s1) != -1){ // s2 contains s1 107 //document.write("<span style='background-color:#FACCFA'>'"+s1+"'<sup>"+strings[s1]+"</sup></span> is in <span style='background-color:#CCFACC'>'"+s2+"'<sup>"+strings[s2]+"</sup></span> "); 108 delete strings[s1]; // remove the smaller s1 109 break; 110 } 111 } 112 } 113 114 115 116 117 // sort them by freq 118 // ============================================================================= 119 function getSortedKeys(obj) { 120 var keys = []; for(var key in obj) keys.push(key); 121 return keys.sort(function(a,b){ return obj[b]-obj[a]; }); 122 } 123 var str_keys = getSortedKeys(strings); 124 125 126 127 128 // display them pretty! 129 // ============================================================================= 130 for(s in str_keys){ 131 132 color = "FFFFFF"; 133 if(strings[str_keys[s]] > 3){ color = "EDF9CB"; } // less 134 if(strings[str_keys[s]] > 4){ color = "E6F7B5"; } 135 if(strings[str_keys[s]] > 5){ color = "D6F67C"; } 136 if(strings[str_keys[s]] > 10){ color = "C3F53A"; } 137 if(strings[str_keys[s]] > 20){ color = "B7F312"; } // more 138 139 document.write("<div style='background-color:#"+color+";border:1px solid black; float: left; margin:.5em; padding:.5em;'>"+str_keys[s].replace(" ","<span style='color:#F0F;>'>_</span>")+"<sup style='background-color:#F9BFF5;border:1px solid black;margin:.5em;padding:.3em'>"+strings[str_keys[s]]+"</sup></div>"); 140 } 141 142 143 144 // display them ugly! (create a dictionary) 145 // ============================================================================= 146 document.write("<hr style='clear:left'>"); 147 document.write("<p style='font-family:monospace'>"); 148 for(s in str_keys){ 149 document.write("\""+str_keys[s].replace("\"","\\\"","g")+"\", "); 150 } 151 document.write("</p>"); 152 153 154 155 156 // ============================================================================= 157 // utils 158 // ============================================================================= 159 160 161 // find the common entries in two dictionaries 162 function intersection(a, b){ 163 document.write("<p style='font-family:monospace'><b>Common:</b> "); 164 for(var i=0; i<a.length; i++){ 165 for(var j=0; j<b.length; j++){ 166 if(a[i]==b[j]){ 167 document.write("\""+a[i].replace("\"","\\\"","g")+"\", "); 168 } 169 } 170 } 171 document.write("</p>"); 172 } 173 174 a = ["foo", "aaa"]; 175 b = ["foo", "bbb"]; 176 intersection(a,b); 177 178 179 // sort a list (most likely list of ) 180 function sortLongestToShortest(list){ 181 return list.sort(function(a,b){ return b.length-a.length; }); 182 } 183 184 list = ["aaaaaa", "b", "ccc", "ddddddddddddddddd"]; 185 186 sortLongestToShortest(list); 187 188 document.write("<p style='font-family:monospace'><b>Len sorted:</b> "); 189 for(var i=0; i<list.length; i++){ 190 document.write("\""+list[i].replace("\"","\\\"","g")+"\", "); 191 } 192 document.write("</p>"); 193 194 195 </script> 196 </body> 197 </html>