Clover Coverage Report
Coverage timestamp: Fri May 9 2008 10:54:27 EST
45   111   3   22.5
2   56   0.07   2
2     1.5  
1    
 
  TestStandardAnalyzer       Line # 24 45 3 100% 1.0
 
  (1)
 
1    package org.apache.lucene.analysis;
2   
3    import junit.framework.TestCase;
4    import org.apache.lucene.analysis.standard.StandardAnalyzer;
5   
6    import java.io.StringReader;
7   
8    /**
9    * Copyright 2004 The Apache Software Foundation
10    * <p/>
11    * Licensed under the Apache License, Version 2.0 (the "License");
12    * you may not use this file except in compliance with the License.
13    * You may obtain a copy of the License at
14    * <p/>
15    * http://www.apache.org/licenses/LICENSE-2.0
16    * <p/>
17    * Unless required by applicable law or agreed to in writing, software
18    * distributed under the License is distributed on an "AS IS" BASIS,
19    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20    * See the License for the specific language governing permissions and
21    * limitations under the License.
22    */
23   
 
24    public class TestStandardAnalyzer extends TestCase {
25   
 
26  37 toggle public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
27  37 TokenStream ts = a.tokenStream("dummy", new StringReader(input));
28  100 for (int i = 0; i < expected.length; i++) {
29  63 Token t = ts.next();
30  63 assertNotNull(t);
31  63 assertEquals(expected[i], t.termText());
32    }
33  37 assertNull(ts.next());
34  37 ts.close();
35    }
36   
37   
 
38  1 toggle public void testStandard() throws Exception {
39  1 Analyzer a = new StandardAnalyzer();
40   
41    // alphanumeric tokens
42  1 assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
43  1 assertAnalyzesTo(a, "2B", new String[]{"2b"});
44   
45    // underscores are delimiters, but not in email addresses (below)
46  1 assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
47  1 assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
48   
49    // other delimiters: "-", "/", ","
50  1 assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase" });
51  1 assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
52  1 assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
53   
54    // internal apostrophes: O'Reilly, you're, O'Reilly's
55    // possessives are actually removed by StandardFilter, not the tokenizer
56  1 assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
57  1 assertAnalyzesTo(a, "you're", new String[]{"you're"});
58  1 assertAnalyzesTo(a, "she's", new String[]{"she"});
59  1 assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
60  1 assertAnalyzesTo(a, "don't", new String[]{"don't"});
61  1 assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
62   
63    // t and s had been stopwords in Lucene <= 2.0, which made it impossible
64    // to correctly search for these terms:
65  1 assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
66  1 assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
67    // 'a' is still a stopword:
68  1 assertAnalyzesTo(a, "a-class", new String[]{"class"});
69   
70    // company names
71  1 assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
72  1 assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
73   
74    // domain names
75  1 assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org" });
76   
77    // email addresses, possibly with underscores, periods, etc
78  1 assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
79  1 assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
80  1 assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
81   
82    // floating point, serial, model numbers, ip addresses, etc.
83    // every other segment must have at least one digit
84  1 assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
85  1 assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
86  1 assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
87  1 assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
88  1 assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
89  1 assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
90   
91    // numbers
92  1 assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
93   
94    // various
95  1 assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" });
96  1 assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
97  1 assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
98  1 assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
99   
100    // acronyms have their dots stripped
101  1 assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
102   
103    // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
104  1 assertAnalyzesTo(a, "C++", new String[]{"c"});
105  1 assertAnalyzesTo(a, "C#", new String[]{"c"});
106   
107    // Korean words
108  1 assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
109   
110    }
111    }