1 <html>
2 <body>
3 <script>
4
5 /*
6
7 This is a utility which was created and used to make the compression
8 dictionary which is used by the Hoot "build" output script compression
9 and decompression.
10
11 This is not used by Hoot in any way, but shall remain for posterity
12 because it's pretty cool.
13
14 It finds strings which repeat themselves in a corpus. Not WORDS, mind you,
15 STRINGS! :-)
16
17 Run this to see the neat output. You can also uncomment the other
18 document.writes in order to see the inner workings of the script.
19
20 Warning: it will absolutely blow up your browser if you give it input
21 which is too large. Start with something small and work your way up to
22 see what works for you.
23
24 Also, the smaller the max_likely_string_len value, the quicker it will
25 run, but it won't be able to catch any repeated strings longer than that
26 value, so fiddle with it all you like. There's no sense in making
27 max_likely_string_len any bigger than half of your total input string
28 because a string longer than half cannot possibly be repeated!
29
30 */
31
32
33
34 // some Lord Tennyson to add some culture to this source:
35 var corpus = "Half a league, half a league, Half a league onward, All in the valley of Death Rode the six hundred. Forward the Light Brigade! Charge for the guns! he said. Into the valley of Death Rode the six hundred. Forward, the Light Brigade! Was there a man dismay'd? Not tho' the soldier knew Some one had blunder'd. Theirs not to make reply, Theirs not to reason why, Theirs but to do and die. Into the valley of Death Rode the six hundred. Cannon to right of them, Cannon to left of them, Cannon in front of them Volley'd and thunder'd; Storm'd at with shot and shell, Boldly they rode and well, Into the jaws of Death, Into the mouth of hell Rode the six hundred. Flash'd all their sabres bare, Flash'd as they turn'd in air Sabring the gunners there, Charging an army, while All the world wonder'd. Plunged in the battery-smoke Right thro' the line they broke; Cossack and Russian Reel'd from the sabre-stroke Shatter'd and sunder'd. Then they rode back, but not, Not the six hundred. Cannon to right of them, Cannon to left of them, Cannon behind them Volley'd and thunder'd; Storm'd at with shot and shell, While horse and hero fell, They that had fought so well Came thro' the jaws of Death, Back from the mouth of hell, All that was left of them, Left of six hundred. When can their glory fade? O the wild charge they made! All the world wonder'd. Honor the charge they made! Honor the Light Brigade, Noble six hundred!";
36
37 // Settings:
38 var max_likely_string_len = 100;
39 var min_string_len = 3;
40 var min_frequency = 3;
41
42 var tree = []; // suffix tree
43
44
45 // make a suffix tree!
46 // =============================================================================
47
48 for(var start_letter=0; start_letter<corpus.length; start_letter++){
49 branch = tree;
50 //document.write(corpus.slice(start_letter,start_letter+max_likely_string_len)+" ");
51 for(var i=0; i<max_likely_string_len && start_letter+i < corpus.length; i++){
52 c = corpus[start_letter+i];
53 if(branch[c]){
54 branch[c].count++;
55 }
56 else{
57 branch[c] = {count:1, mybranch:{}};
58 }
59 //document.write(c+"<sup>"+branch[c].count+"</sup>");
60 branch = branch[c].mybranch;
61 }
62 //document.write("<br>");
63 }
64
65
66
67 // grab the strings!
68 // =============================================================================
69
70 var strings = {};
71
72 function getStrings(tree, prevstr, freq){
73
74 for(c in tree){
75 str = prevstr
76 if(tree[c].count >= min_frequency){
77 str+=c;
78 //document.write("<span style='background-color:#DDD'>"+prevstr+"<sup>"+freq+"</sup> "+str+"<sup>"+tree[c].count+"</sup></span> ");
79 getStrings(tree[c].mybranch, str, tree[c].count);
80 }
81 }
82
83 if(str.length>=min_string_len && strings[str] === undefined){
84 strings[str]=freq;
85 //document.write("<span style='background-color:#BF7'>Done creating: "+str+"<sup>"+freq+"</sup></span><br>");
86 }
87
88 // this if is strictly for document.write display of inner workings
89 //if(str.length<min_string_len){
90 // document.write("<span style='background-color:#F99'>Too short, discarding: "+str+"</span><br>");
91 //}
92 }
93 getStrings(tree, "", 0);
94
95
96
97
98 // discard smaller strings that are inside of bigger strings
99 // =============================================================================
100 for(s1 in strings){
101 for(s2 in strings){
102 if(
103 s2 != s1 // they are NOT the same string
104 && strings[s2] == strings[s1] // they ARE the same freq
105 && s2.length > s1.length // s2 is larger
106 && s2.indexOf(s1) != -1){ // s2 contains s1
107 //document.write("<span style='background-color:#FACCFA'>'"+s1+"'<sup>"+strings[s1]+"</sup></span> is in <span style='background-color:#CCFACC'>'"+s2+"'<sup>"+strings[s2]+"</sup></span> ");
108 delete strings[s1]; // remove the smaller s1
109 break;
110 }
111 }
112 }
113
114
115
116
117 // sort them by freq
118 // =============================================================================
119 function getSortedKeys(obj) {
120 var keys = []; for(var key in obj) keys.push(key);
121 return keys.sort(function(a,b){ return obj[b]-obj[a]; });
122 }
123 var str_keys = getSortedKeys(strings);
124
125
126
127
128 // display them pretty!
129 // =============================================================================
130 for(s in str_keys){
131
132 color = "FFFFFF";
133 if(strings[str_keys[s]] > 3){ color = "EDF9CB"; } // less
134 if(strings[str_keys[s]] > 4){ color = "E6F7B5"; }
135 if(strings[str_keys[s]] > 5){ color = "D6F67C"; }
136 if(strings[str_keys[s]] > 10){ color = "C3F53A"; }
137 if(strings[str_keys[s]] > 20){ color = "B7F312"; } // more
138
139 document.write("<div style='background-color:#"+color+";border:1px solid black; float: left; margin:.5em; padding:.5em;'>"+str_keys[s].replace(" ","<span style='color:#F0F;>'>_</span>")+"<sup style='background-color:#F9BFF5;border:1px solid black;margin:.5em;padding:.3em'>"+strings[str_keys[s]]+"</sup></div>");
140 }
141
142
143
144 // display them ugly! (create a dictionary)
145 // =============================================================================
146 document.write("<hr style='clear:left'>");
147 document.write("<p style='font-family:monospace'>");
148 for(s in str_keys){
149 document.write("\""+str_keys[s].replace("\"","\\\"","g")+"\", ");
150 }
151 document.write("</p>");
152
153
154
155
156 // =============================================================================
157 // utils
158 // =============================================================================
159
160
161 // find the common entries in two dictionaries
162 function intersection(a, b){
163 document.write("<p style='font-family:monospace'><b>Common:</b> ");
164 for(var i=0; i<a.length; i++){
165 for(var j=0; j<b.length; j++){
166 if(a[i]==b[j]){
167 document.write("\""+a[i].replace("\"","\\\"","g")+"\", ");
168 }
169 }
170 }
171 document.write("</p>");
172 }
173
174 a = ["foo", "aaa"];
175 b = ["foo", "bbb"];
176 intersection(a,b);
177
178
179 // sort a list (most likely list of )
180 function sortLongestToShortest(list){
181 return list.sort(function(a,b){ return b.length-a.length; });
182 }
183
184 list = ["aaaaaa", "b", "ccc", "ddddddddddddddddd"];
185
186 sortLongestToShortest(list);
187
188 document.write("<p style='font-family:monospace'><b>Len sorted:</b> ");
189 for(var i=0; i<list.length; i++){
190 document.write("\""+list[i].replace("\"","\\\"","g")+"\", ");
191 }
192 document.write("</p>");
193
194
195 </script>
196 </body>
197 </html>