1 <html>
     
2 <body>
     
3 <script>
     
4 
     5 /*
     
6 
     7 This is a utility which was created and used to make the compression
     
8 dictionary which is used by the Hoot "build" output script compression
     
9 and decompression.
    
10 
    11 This is not used by Hoot in any way, but shall remain for posterity
    
12 because it's pretty cool.
    
13 
    14 It finds strings which repeat themselves in a corpus. Not WORDS, mind you, 
    
15 STRINGS!  :-)
    
16 
    17 Run this to see the neat output.  You can also uncomment the other
    
18 document.writes in order to see the inner workings of the script.
    
19 
    20 Warning: it will absolutely blow up your browser if you give it input
    
21 which is too large.  Start with something small and work your way up to
    
22 see what works for you.
    
23 
    24 Also, the smaller the max_likely_string_len value, the quicker it will
    
25 run, but it won't be able to catch any repeated strings longer than that
    
26 value, so fiddle with it all you like.  There's no sense in making
    
27 max_likely_string_len any bigger than half of your total input string
    
28 because a string longer than half cannot possibly be repeated!
    
29 
    30 */
    
31 
    32 
    33 
    34 // some Lord Tennyson to add some culture to this source:
    
35 var corpus = "Half a league, half a league, Half a league onward, All in the valley of Death Rode the six hundred. Forward the Light Brigade! Charge for the guns! he said. Into the valley of Death Rode the six hundred. Forward, the Light Brigade! Was there a man dismay'd? Not tho' the soldier knew Some one had blunder'd. Theirs not to make reply, Theirs not to reason why, Theirs but to do and die. Into the valley of Death Rode the six hundred. Cannon to right of them, Cannon to left of them, Cannon in front of them Volley'd and thunder'd; Storm'd at with shot and shell, Boldly they rode and well, Into the jaws of Death, Into the mouth of hell Rode the six hundred. Flash'd all their sabres bare, Flash'd as they turn'd in air Sabring the gunners there, Charging an army, while All the world wonder'd. Plunged in the battery-smoke Right thro' the line they broke; Cossack and Russian Reel'd from the sabre-stroke Shatter'd and sunder'd. Then they rode back, but not, Not the six hundred. Cannon to right of them, Cannon to left of them, Cannon behind them Volley'd and thunder'd; Storm'd at with shot and shell, While horse and hero fell, They that had fought so well Came thro' the jaws of Death, Back from the mouth of hell, All that was left of them, Left of six hundred. When can their glory fade? O the wild charge they made! All the world wonder'd. Honor the charge they made! Honor the Light Brigade, Noble six hundred!";
    
36 
    37 // Settings:
    
38 var max_likely_string_len = 100;
    
39 var min_string_len = 3;
    
40 var min_frequency = 3;
    
41 
    42 var tree = []; // suffix tree
    
43 
    44 
    45 // make a suffix tree!
    
46 // =============================================================================
    
47 
    48 for(var start_letter=0; start_letter<corpus.length; start_letter++){
    
49     branch = tree;
    
50     //document.write(corpus.slice(start_letter,start_letter+max_likely_string_len)+"        ");
    
51     for(var i=0; i<max_likely_string_len && start_letter+i < corpus.length; i++){
    
52         c = corpus[start_letter+i];
    
53         if(branch[c]){
    
54             branch[c].count++;
    
55         }
    
56         else{
    
57             branch[c] = {count:1, mybranch:{}};
    
58         }
    
59         //document.write(c+"<sup>"+branch[c].count+"</sup>");
    
60         branch = branch[c].mybranch;
    
61     }
    
62     //document.write("<br>");
    
63 }
    
64 
    65 
    66 
    67 // grab the strings!
    
68 // =============================================================================
    
69 
    70 var strings = {};
    
71 
    72 function getStrings(tree, prevstr, freq){
    
73 
    74     for(c in tree){
    
75         str = prevstr
    
76         if(tree[c].count >= min_frequency){
    
77             str+=c;
    
78             //document.write("<span style='background-color:#DDD'>"+prevstr+"<sup>"+freq+"</sup> "+str+"<sup>"+tree[c].count+"</sup></span> ");
    
79             getStrings(tree[c].mybranch, str, tree[c].count);
    
80         }
    
81     }
    
82 
    83     if(str.length>=min_string_len && strings[str] === undefined){
    
84         strings[str]=freq;
    
85         //document.write("<span style='background-color:#BF7'>Done creating: "+str+"<sup>"+freq+"</sup></span><br>");
    
86     }
    
87 
    88     // this if is strictly for document.write display of inner workings
    
89     //if(str.length<min_string_len){
    
90     //    document.write("<span style='background-color:#F99'>Too short, discarding: "+str+"</span><br>");
    
91     //}
    
92 }
    
93 getStrings(tree, "", 0);
    
94 
    95 
    96 
    97 
    98 // discard smaller strings that are inside of bigger strings
    
99 // =============================================================================
   
100 for(s1 in strings){
   
101     for(s2 in strings){
   
102         if(
   
103             s2 != s1                      // they are NOT the same string
   
104             && strings[s2] == strings[s1] // they ARE the same freq
   
105             && s2.length > s1.length      // s2 is larger
   
106             && s2.indexOf(s1) != -1){     // s2 contains s1
   
107             //document.write("<span style='background-color:#FACCFA'>'"+s1+"'<sup>"+strings[s1]+"</sup></span> is in <span style='background-color:#CCFACC'>'"+s2+"'<sup>"+strings[s2]+"</sup></span> ");
   
108             delete strings[s1];           // remove the smaller s1
   
109             break;
   
110         }
   
111     }
   
112 }
   
113 
   114 
   115 
   116 
   117 // sort them by freq
   
118 // =============================================================================
   
119 function getSortedKeys(obj) {
   
120     var keys = []; for(var key in obj) keys.push(key);
   
121     return keys.sort(function(a,b){ return obj[b]-obj[a]; });
   
122 }
   
123 var str_keys = getSortedKeys(strings);
   
124 
   125 
   126 
   127 
   128 // display them pretty!
   
129 // =============================================================================
   
130 for(s in str_keys){
   
131 
   132     color = "FFFFFF";
   
133     if(strings[str_keys[s]] > 3){  color = "EDF9CB"; } // less
   
134     if(strings[str_keys[s]] > 4){ color = "E6F7B5"; }
   
135     if(strings[str_keys[s]] > 5){ color = "D6F67C"; }
   
136     if(strings[str_keys[s]] > 10){ color = "C3F53A"; }
   
137     if(strings[str_keys[s]] > 20){ color = "B7F312"; } // more
   
138 
   139     document.write("<div style='background-color:#"+color+";border:1px solid black; float: left; margin:.5em; padding:.5em;'>"+str_keys[s].replace(" ","<span style='color:#F0F;>'>_</span>")+"<sup style='background-color:#F9BFF5;border:1px solid black;margin:.5em;padding:.3em'>"+strings[str_keys[s]]+"</sup></div>");
   
140 }
   
141 
   142 
   143 
   144 // display them ugly! (create a dictionary)
   
145 // =============================================================================
   
146 document.write("<hr style='clear:left'>");
   
147 document.write("<p style='font-family:monospace'>");
   
148 for(s in str_keys){
   
149     document.write("\""+str_keys[s].replace("\"","\\\"","g")+"\", ");
   
150 }
   
151 document.write("</p>");
   
152 
   153 
   154 
   155 
   156 // =============================================================================
   
157 // utils
   
158 // =============================================================================
   
159 
   160 
   161 // find the common entries in two dictionaries
   
162 function intersection(a, b){
   
163     document.write("<p style='font-family:monospace'><b>Common:</b> ");
   
164     for(var i=0; i<a.length; i++){
   
165         for(var j=0; j<b.length; j++){
   
166             if(a[i]==b[j]){
   
167                 document.write("\""+a[i].replace("\"","\\\"","g")+"\", ");
   
168             }
   
169         }
   
170     }
   
171     document.write("</p>");
   
172 }
   
173 
   174 a = ["foo", "aaa"];
   
175 b = ["foo", "bbb"];
   
176 intersection(a,b);
   
177 
   178 
   179 // sort a list (most likely list of )
   
180 function sortLongestToShortest(list){
   
181     return list.sort(function(a,b){ return b.length-a.length; });
   
182 }
   
183 
   184 list = ["aaaaaa", "b", "ccc", "ddddddddddddddddd"];
   
185 
   186 sortLongestToShortest(list);
   
187 
   188 document.write("<p style='font-family:monospace'><b>Len sorted:</b> ");
   
189 for(var i=0; i<list.length; i++){
   
190     document.write("\""+list[i].replace("\"","\\\"","g")+"\", ");
   
191 }
   
192 document.write("</p>");
   
193 
   194 
   195 </script>
   
196 </body>
   
197 </html>