Commit 7be59837 authored by Miklos Vajna

move find-german-comments from build repo to bootstrap/bin

parent 9b7439f8
#!/usr/bin/env python
########################################################################
#
# Copyright (c) 2010 Jonas Jensen, Miklos Vajna
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
########################################################################
import sys, re, subprocess, os, optparse, string
class Parser:
    """
    This parser extracts comments from source files, tries to guess
    their language and then prints out the german ones.
    """

    def __init__(self):
        # Characters stripped from both ends of every comment fragment.
        self.strip = string.punctuation + " \n"
        op = optparse.OptionParser()
        op.set_usage("%prog [options] <rootdir>\n\n" +
                     "Searches for german comments in cxx/hxx source files inside a given root\n" +
                     "directory recursively.")
        op.add_option("-v", "--verbose", action="store_true",
                      dest="verbose", default=False,
                      help="Turn on verbose mode (print progress to stderr)")
        self.options, args = op.parse_args()
        try:
            rootdir = args[0]
        except IndexError:
            rootdir = "."
        self.check_source_files(rootdir)

    def get_comments(self, filename):
        """
        Extracts the source code comments of filename.

        Yields (linenum, text) tuples: for a group of //-style comments
        the line of the first non-comment line after the group, for a
        /* ... */ block the line where the block starts, and for a
        one-line /* ... */ comment its own line.
        """
        linenum = 0
        if self.options.verbose:
            sys.stderr.write("processing file '%s'...\n" % filename)
        # use a context manager so the handle is closed even on errors
        with open(filename) as sock:
            # add an empty line to trigger the output of a collected
            # oneliner comment group at end of file
            lines = sock.readlines() + ["\n"]
        in_comment = False
        buf = []
        count = 1
        for i in lines:
            if "//" in i and not in_comment:
                # if we find a new //-style comment, then we
                # just append it to a previous one if: there is
                # only whitespace before the // mark that is
                # necessary to make comments longer, giving
                # more reliable output
                if not len(re.sub(r"(.*)//.*", r"\1", i).strip(self.strip)):
                    s = re.sub(r".*// ?", "", i).strip(self.strip)
                    if len(s):
                        buf.append(s)
                else:
                    # otherwise it's an independent //-style comment in the next line
                    yield (count, "\n ".join(buf))
                    buf = [re.sub(r".*// ?", "", i.strip(self.strip))]
            elif "//" not in i and not in_comment and len(buf) > 0:
                # first normal line after a // block
                yield (count, "\n ".join(buf))
                buf = []
            elif "/*" in i and "*/" not in i and not in_comment:
                # start of a real multiline comment; remember where it
                # started so it can be reported at that line
                in_comment = True
                linenum = count
                s = re.sub(r".*/\*+", "", i.strip(self.strip))
                if len(s):
                    buf.append(s.strip(self.strip))
            elif in_comment and "*/" not in i:
                # in multiline comment
                s = re.sub(r"^( |\|)*\*?", "", i)
                if len(s.strip(self.strip)):
                    buf.append(s.strip(self.strip))
            elif "*/" in i and in_comment:
                # end of multiline comment; report it at the recorded
                # start line (bugfix: 'linenum' was collected but the
                # end line used to be reported instead)
                in_comment = False
                s = re.sub(r"\*+/.*", "", i.strip(self.strip))
                if len(s):
                    buf.append(s)
                yield (linenum, "\n ".join(buf))
                buf = []
            elif "/*" in i and "*/" in i:
                # c-style oneliner comment
                yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
            count += 1

    def get_lang(self, s):
        """ the output is 'german' or 'english' or 'german or english'. when
        unsure, just don't warn, there are strings where you just can't
        determine the results reliably, like '#110680#' """
        cwd = os.getcwd()
        # text_cat is shipped next to this script, so change to our directory
        os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
        sock = subprocess.Popen(["text_cat/text_cat", "-d", "text_cat/LM"],
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                universal_newlines=True)
        # communicate() feeds stdin, closes it and drains stdout for us,
        # avoiding the deadlock that manual write()/read() can cause
        lang = sock.communicate(s)[0].strip()
        os.chdir(cwd)
        return lang

    def is_german(self, s):
        """
        determines if a string is german or not
        """
        # for short strings we can't do reliable recognition, so skip
        # short strings and less than 4 words
        s = s.replace('\n', ' ')
        if len(s) < 32 or len(s.split()) < 4:
            return False
        return "german" == self.get_lang(s)

    def check_file(self, path):
        """
        checks each comment in a file
        """
        for linenum, s in self.get_comments(path):
            if self.is_german(s):
                # print as a function call: valid in Python 2 and 3
                print("%s:%s: %s" % (path, linenum, s))

    def check_source_files(self, rootdir):
        """
        checks each _tracked_ file in a directory recursively
        """
        # ask git for the tracked cxx/hxx files below rootdir
        sock = os.popen(r"git ls-files '%s' |egrep '\.(c|h)xx$'" % rootdir)
        lines = sock.readlines()
        sock.close()
        for path in lines:
            self.check_file(path.strip())
# Script entry point: run the parser and exit quietly on Ctrl-C.
try:
    Parser()
except KeyboardInterrupt:
    # print as a function call: the statement form is a syntax error on
    # Python 3 and the call form works on Python 2 as well
    print("Interrupted!")
    sys.exit(0)
# vim:set shiftwidth=4 softtabstop=4 expandtab:
This diff is collapsed.
Copyright (c) 1994, 1995, 1996, 1997 by Gertjan van Noord.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the
Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301 USA
cf. the file COPYING
_ 20326
e 6617
t 4843
o 3834
n 3653
i 3602
a 3433
s 2945
r 2921
h 2507
e_ 2000
d 1816
_t 1785
c 1639
l 1635
th 1535
he 1351
_th 1333
u 1309
f 1253
m 1175
p 1151
_a 1145
the 1142
_the 1060
s_ 978
er 968
_o 967
he_ 928
d_ 888
t_ 885
the_ 844
_the_ 843
on 842
in 817
y 783
n_ 773
b 761
re 754
, 734
,_ 732
an 732
g 728
w 718
_i 707
en 676
f_ 599
y_ 595
of 594
_of 592
es 589
ti 587
v 580
_of_ 575
of_ 575
nd 568
at 549
r_ 540
_w 534
it 522
ed 496
_p 494
nt 485
_c 462
o_ 457
io 450
_an 439
te 432
or 425
_b 418
nd_ 407
to 406
st 402
is 401
_s 396
_in 389
ion 385
and 385
de 384
ve 382
ha 375
ar 366
_m 361
and_ 360
_and 360
_and_ 358
se 353
_to 347
me 346
to_ 344
ed_ 339
. 330
be 329
_f 329
._ 329
_to_ 320
co 317
ic 316
ns 308
al 307
le 304
ou 304
ce 293
ent 279
l_ 278
_co 277
tio 275
on_ 274
_d 274
tion 268
ri 266
_e 264
ng 253
hi 251
er_ 249
ea 246
as 245
_be 242
pe 242
h_ 234
_r 232
ec 227
ch 223
ro 222
ct 220
_h 219
pr 217
in_ 217
ne 214
ll 214
rt 213
s,_ 210
s, 210
li 209
ra 208
T 207
wh 204
a_ 203
ac 201
_wh 199
_n 196
ts 196
di 196
es_ 195
si 194
re_ 193
at_ 192
nc 192
ie 190
_a_ 188
_in_ 185
ing 184
us 182
_re 182
g_ 179
ng_ 178
op 178
con 177
tha 175
_l 174
_tha 174
ver 173
ma 173
ion_ 171
_con 171
ci 170
ons 170
_it 170
po 169
ere 168
is_ 167
ta 167
la 166
_pr 165
fo 164
ho 164
ir 162
ss 161
men 160
be_ 160
un 159
ty 159
_be_ 158
ing_ 157
om 156
ot 156
hat 155
ly 155
_g 155
em 153
_T 151
rs 150
mo 148
ch_ 148
wi 147
we 147
ad 147
ts_ 145
res 143
_wi 143
I 143
hat_ 142
ei 141
ly_ 141
ni 140
os 140
ca 139
ur 139
A 138
ut 138
that 138
_that 137
ati 137
_fo 137
st_ 137
il 136
or_ 136
for 136
pa 136
ul 135
ate 135
ter 134
it_ 134
nt_ 133
that_ 132
_ha 129
al_ 128
el 128
as_ 127
ll_ 127
_ma 125
no 124
ment 124
an_ 124
tion_ 122
su 122
bl 122
_de 122
nce 120
pl 120
fe 119
tr 118
so 118
int 115
ov 114
e, 114
e,_ 114
_u 113
ent_ 113
Th 113
her 113
j 112
atio 112
ation 112
_Th 111
le_ 110
ai 110
_it_ 110
_on 110
_for 109
ect 109
k 109
hic 108
est 108
der 107
tu 107
na 106
_by_ 106
by_ 106
E 106
by 106
_by 106
ve_ 106
_di 106
en_ 104
vi 104
m_ 103
_whi 102
iv 102
whi 102
ns_ 102
_A 101
ich 100
ge 100
pro 99
ess 99
_whic 99
ers 99
hich 99
ce_ 99
which 99
whic 99
all 98
ove 98
_is 98
ich_ 97
ee 97
hich_ 97
n,_ 96
n, 96
im 95
ir_ 94
hei 94
ions 94
sti 94
se_ 94
per 93
The 93
_pa 93
heir 93
id 93
eir 93
eir_ 93
ig 93
heir_ 93
_no 93
ev 93
era 92
_int 92
ted 91
_The 91
ies 91
art 91
thei 90
_ar 90
_thei 90
their 90
_pro 90
et 89
_pe 88
_mo 88
ther 88
x 87
gh 87
S 87
_is_ 87
ol 87
ty_ 87
_I 86
nde 86
am 86
rn 86
nte 86
mp 85
_su 84
_we 84
par 84
_v 84
pu 82
his 82
ow 82
mi 82
go 81
N 81
ue 81
ple 81
ep 80
ab 80
;_ 80
; 80
ex 80
ain 80
over 80
_un 79
q 79
qu 79
pp 79
ith 79
ry 79
_as 79
ber 79
ub 78
av 78
uc 78
s._ 77
s. 77
enc 77
are 77
iti 77
gr 76
his_ 76
ua 76
part 76
ff 75
eve 75
O 75
rea 74
ous 74
ia 74
The_ 73
ag 73
mb 73
_go 73
fa 72
on,_ 72
ern 72
t,_ 72
on, 72
t, 72
_me 71
_ 31586
e 15008
n 9058
i 7299
r 6830
t 5662
s 5348
a 4618
h 4176
d 4011
er 3415
en 3412
u 3341
l 3266
n_ 2848
c 2636
ch 2460
g 2407
o 2376
e_ 2208
r_ 2128
m 2077
_d 1948
de 1831
en_ 1786
ei 1718
er_ 1570
in 1568
te 1505
ie 1505
b 1458
t_ 1425
f 1306
k 1176
ge 1144
s_ 1137
un 1113
, 1104
,_ 1099
w 1099
z 1060
nd 1039
he 1004
st 989
_s 952
_de 949
. 909
_e 906
ne 906
der 880
._ 847
be 841
es 829
ic 796
_a 791
ie_ 779
is 769
ich 763
an 755
re 749
di 732
ein 730
se 730
" 720
ng 709
_i 706
sc 683
sch 681
it 673
der_ 652
h_ 651
ch_ 642
S 630
le 609
p 609
ä 607
ü 603
au 603
v 602
che 599
_w 596
d_ 585
die 576
_di 572
m_ 562
_die 559
el 548
_S 540
_der 529
li 527
_der_ 523
si 515
al 514
ns 507
on 501
or 495
ti 490
ten 487
ht 486
die_ 485
_die_ 483
D 479
rt 478
nd_ 476
_u 470
nt 468
A 466
in_ 464
den 461
cht 447
und 443
me 440
_z 429
ung 426
ll 423
_un 421
_ei 419
_n 415
hr 412
ine 412
_A 408
_ein 405
ar 404
ra 403
_v 400
_g 400
as 395
zu 392
et 389
em 385
_D 380
eine 376
gen 376
g_ 376
da 368
we 366
K 365
lt 360
B 354
_" 353
nde 349
ni 347
und_ 345
E 345
ur 345
_m 342
ri 341
ha 340
eh 339
ten_ 338
es_ 336
_K 336
_und 335
ig 335
_b 335
hen 334
_und_ 332
_au 329
_B 327
_da 325
_zu 324
_in 322
at 321
us 318
wi 307
n, 305
n,_ 304
nn 304
te_ 301
eit 301
_h 300
ter 299
M 298
n. 295
ß 294
ng_ 289
sche 289
- 283
rs 282
den_ 282
_si 280
G 280
im 278
_ge 277
chen 276
rd 273
_E 273
n._ 270
icht 270
rn 268
uf 267
isch 264
isc 264
nen 263
_in_ 262
_M 260
_er 257
ich_ 255
ac 253
lic 252
_G 252
ber 252
la 251
vo 251
eb 250
ke 249
F 248
as_ 248
hen_ 248
ach 245
en, 244
ung_ 243
lich 243
ste 243
en,_ 243
_k 241
ben 241
_f 241
en. 241
_be 239
it_ 239
L 238
_se 237
mi 236
ve 236
na 236
on_ 236
P 235
ss 234
ist 234
ö 234
ht_ 233
ru 233
st_ 229
_F 229
ts 227
ab 226
W 226
ol 225
_eine 225
hi 225
so 224
em_ 223
"_ 223
ren 222
en._ 221
chen_ 221
R 221
ta 221
ere 220
ische 219
ers 218
ert 217
_P 217
tr 217
ed 215
ze 215
eg 215
ens 215
ür 213
ah 212
_vo 212
ne_ 211
cht_ 210
uc 209
_wi 209
nge 208
lle 208
fe 207
_L 207
ver 206
hl 205
V 204
ma 203
wa 203
auf 201
H 198
_W 195
T 195
nte 193
uch 193
l_ 192
sei 192
nen_ 190
u_ 189
_den 189
_al 189
_V 188
t. 188
lte 187
ut 186
ent 184
sich 183
sic 183
il 183
ier 182
am 181
gen_ 180
sen 179
fü 178
um 178
t._ 177
f_ 174
he_ 174
ner 174
nst 174
ls 174
_sei 173
ro 173
ir 173
ebe 173
mm 173
ag 172
ern 169
t,_ 169
t, 169
eu 169
ft 168
icht_ 167
hre 167
Be 166
nz 165
nder 165
_T 164
_den_ 164
iche 163
tt 163
zu_ 162
and 162
J 161
rde 160
rei 160
_we 159
_H 159
ige 159
_Be 158
rte 157
hei 156
das 155
aus 155
che_ 154
_das 154
_zu_ 154
tz 154
_ni 153
das_ 153
_R 153
N 153
des 153
_ve 153
_J 152
I 152
_das_ 152
men 151
_so 151
_ver 151
_auf 150
ine_ 150
_ha 150
rg 149
ind 148
eben 148
kt 147
mit 147
_an 147
her 146
Ge 146
Sc 145
_sich 145
U 145
Sch 145
_sic 145
end 145
Di 144
abe 143
ck 143
sse 142
ür_ 142
ell 142
ik 141
o_ 141
nic 141
nich 141
sa 141
_fü 140
hn 140
zi 140
no 140
nicht 140
im_ 139
von_ 139
von 139
_nic 139
_nich 139
eine_ 139
oc 138
wei 138
io 138
schen 138
gt 138
#!/usr/bin/perl -w
# Gertjan van Noord, 1997.
# mailto:vannoord@let.rug.nl

# text_cat: n-gram based text categorization, typically used to guess
# the (natural) language of a document.

use strict;
use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_s $opt_t $opt_v $opt_u $opt_a);
use Getopt::Std;
use Benchmark;

# character class of characters that never belong to a word when the
# input is split into words for ngram building
my $non_word_characters='0-9\s';

# OPTIONS
getopts('a:d:f:hi:lnst:u:v');

# defaults: set $opt_X unless already defined (Perl Cookbook p. 6):
$opt_a ||= 10;     # max number of candidate languages before giving up
$opt_d ||= '/users1/vannoord/Perl/TextCat/LM';  # directory with *.lm models
$opt_f ||= 0;      # drop ngrams occurring this many times or fewer
$opt_t ||= 400;    # number of topmost ngrams to compare
$opt_u ||= 1.05;   # report alternatives up to this factor worse than best
# Print the usage message. The heredoc interpolates the current values
# of $0 and the $opt_* defaults, so it must run after the defaults above
# have been set.
sub help {
print <<HELP
Text Categorization. Typically used to determine the language of a
given document.
Usage
-----
* print help message:
$0 -h
* for guessing:
$0 [-a Int] [-d Dir] [-f Int] [-i N] [-l] [-t Int] [-u Int] [-v]
-a the program returns the best-scoring language together
with all languages which are $opt_u times worse (cf option -u).
If the number of languages to be printed is larger than the value
of this option (default: $opt_a) then no language is returned, but
instead a message that the input is of an unknown language is
printed. Default: $opt_a.
-d indicates in which directory the language models are
located (files ending in .lm). Currently only a single
directory is supported. Default: $opt_d.
-f Before sorting is performed the Ngrams which occur this number
of times or less are removed. This can be used to speed up
the program for longer inputs. For short inputs you should use
-f 0.
Default: $opt_f.
-i N only read first N lines
-l indicates that input is given as an argument on the command line,
e.g. text_cat -l "this is english text"
Cannot be used in combination with -n.
-s Determine language of each line of input. Not very efficient yet,
because language models are re-loaded after each line.
-t indicates the topmost number of ngrams that should be used.
If used in combination with -n this determines the size of the
output. If used with categorization this determines
the number of ngrams that are compared with each of the language
models (but each of those models is used completely).
-u determines how much worse result must be in order not to be
mentioned as an alternative. Typical value: 1.05 or 1.1.
Default: $opt_u.
-v verbose. Continuation messages are written to standard error.
* for creating new language model, based on text read from standard input:
$0 -n [-v]
-v verbose. Continuation messages are written to standard error.
HELP
}
# MAIN dispatch: -h prints help; -n builds and dumps a new language
# model from standard input; -l classifies the command line argument;
# -s classifies every input line separately; otherwise all of standard
# input is classified as one document.
if ($opt_h) { help(); exit 0; };
if ($opt_n) {
    my %ngram=();
    my @result = create_lm(input(),\%ngram);
    # dump "ngram<TAB> count" lines, most frequent first
    print join("\n",map { "$_\t $ngram{$_}" ; } @result),"\n";
} elsif ($opt_l) {
    classify($ARGV[0]);
} elsif ($opt_s) {
    # classify line by line (models are re-loaded for every line)
    while (<>) {
        chomp;
        classify($_);
    }
} else {
    classify(input());
}
# CLASSIFICATION
# Guess the language of the string argument: build its ngram profile,
# compute an out-of-place distance against every *.lm model in $opt_d
# and print the best-scoring language(s) to standard output.
sub classify {
    my ($input)=@_;
    my %results=();
    # penalty for an input ngram that is absent from a language model
    my $maxp = $opt_t;
    # open directory to find which languages are supported
    opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
    closedir DIR;
    @languages or die "sorry, can't read any language models from $opt_d\n" .
        "language models must reside in files with .lm ending\n";
    # create ngrams for input. Note that hash %unknown is not used;
    # it contains the actual counts which are only used under -n: creating
    # new language model (and even then they are not really required).
    my @unknown=create_lm($input);
    # load model and count for each language.
    my $language;
    my $t1 = new Benchmark;
    foreach $language (@languages) {
        # loads the language model into hash %$language.
        my %ngram=();
        my $rang=1;
        open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n";
        while (<LM>) {
            chomp;
            # only use lines starting with appropriate character. Others are
            # ignored.
            if (/^[^$non_word_characters]+/o) {
                # rank of the ngram in the model file, 1 = most frequent
                $ngram{$&} = $rang++;
            }
        }
        close(LM);
        #print STDERR "loaded language model $language\n" if $opt_v;
        # compares the language model with input ngrams list:
        # sum of rank differences ("out of place" measure), with $maxp
        # charged for ngrams missing from the model
        my ($i,$p)=(0,0);
        while ($i < @unknown) {
            if ($ngram{$unknown[$i]}) {
                $p=$p+abs($ngram{$unknown[$i]}-$i);
            } else {
                $p=$p+$maxp;
            }
            ++$i;
        }
        #print STDERR "$language: $p\n" if $opt_v;
        $results{$language} = $p;
    }
    print STDERR "read language models done (" .
        timestr(timediff(new Benchmark, $t1)) .
        ".\n" if $opt_v;
    # lower distance = better match; sort candidates by score
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v;
    my $a = $results{$results[0]};
    # keep every language at most $opt_u times worse than the best one
    my @answers=(shift(@results));
    while (@results && $results{$results[0]} < ($opt_u *$a)) {
        @answers=(@answers,shift(@results));
    }
    if (@answers > $opt_a) {
        # too many near-ties: refuse to guess
        print "I don't know; " .
            "Perhaps this is a language I haven't seen before?\n";
    } else {
        print join(" or ", @answers), "\n";
    }
}
# Read the text to be classified: with -i N, at most the first N lines
# of input; otherwise all of standard input in one slurp. Dies on empty
# input. (NOTE(review): the comment that used to sit here described
# create_lm's hash-reference argument, not this sub.)
sub input {
    my $read="";
    if ($opt_i) {
        # read at most $opt_i lines, returning early once the limit is hit
        while(<>) {
            if ($. == $opt_i) {
                return $read . $_;
            }
            $read = $read . $_;
        }
        return $read;
    } else {
        local $/; # so it doesn't affect $/ elsewhere
        undef $/;
        $read = <>; # swallow input.
        $read || die "determining the language of an empty file is hard...\n";
        return $read;
    }
}
# Build an ngram frequency model from the text in the first argument.
# The second (optional) argument is a reference to a hash; this hash is
# filled with ngram counts, and a sorted list of the (at most $opt_t)
# most frequent ngrams is returned.
sub create_lm {
    my $t1 = new Benchmark;
    my $ngram;
    ($_,$ngram) = @_; #$ngram contains reference to the hash we build
    # then add the ngrams found in each word in the hash
    my $word;
    foreach $word (split("[$non_word_characters]+")) {
        # mark word boundaries so edge ngrams are distinguishable
        $word = "_" . $word . "_";
        my $len = length($word);
        my $flen=$len;
        my $i;
        # count every ngram of length 1..5 starting at position $i;
        # $len shrinks so ngrams never run past the end of the word
        for ($i=0;$i<$flen;$i++) {
            $$ngram{substr($word,$i,5)}++ if $len > 4;
            $$ngram{substr($word,$i,4)}++ if $len > 3;
            $$ngram{substr($word,$i,3)}++ if $len > 2;
            $$ngram{substr($word,$i,2)}++ if $len > 1;
            $$ngram{substr($word,$i,1)}++;
            $len--;
        }
    }
    ###print "@{[%$ngram]}";
    my $t2 = new Benchmark;
    print STDERR "count_ngrams done (".
        timestr(timediff($t2, $t1)) .").\n" if $opt_v;
    # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up
    # sorting by removing singletons
    map { my $key=$_; if ($$ngram{$key} <= $opt_f)
        { delete $$ngram{$key}; }; } keys %$ngram;
    #however I have very bad results for short inputs, this way
    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice(@sorted,$opt_t) if (@sorted > $opt_t);
    print STDERR "sorting done (" .
        timestr(timediff(new Benchmark, $t2)) .
        ").\n" if $opt_v;
    return @sorted;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment