首页 > 代码库 > java大作业 KShinglingAlgorithm

java大作业 KShinglingAlgorithm

wiki上关于KShingling Algorithm(w-shingling)的说明:

http://en.wikipedia.org/wiki/W-shingling

摘要:

In natural language processing a w-shingling is a set of unique "shingles"—contiguous subsequences of tokens in a document—that can be used to gauge the similarity of two documents. The w denotes the number of tokens in each shingle in the set.

The document, "a rose is a rose is a rose" can be tokenized as follows:

(a,rose,is,a,rose,is,a,rose)

The set of all contiguous sequences of 4 tokens (N-grams, here: 4-grams) is

{ (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is), (a,rose,is,a), (rose,is,a,rose) } = { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is) }

我对该算法的理解是:先把两段文本分别按上述方式分解成 shingle 集合,再统计两个集合的并集大小 b 和交集大小 a,用 a/b(即 Jaccard 相似系数)作为两段文本的相似度。

写得有些复杂:

  1 package bigproject2;  2   3 import javax.swing.JOptionPane;  4   5 public class union {  6     //求子集  7     public String[] ziji(String str)  8     {  9         char[] ch=str.toCharArray(); 10         int c=0; 11         for(int i=0;i<ch.length;i++) 12         { 13             if(ch[i]==‘ ‘) 14                 c++; 15         } 16         //建立单词数组 17         String[] strt=new String[c+1]; 18         for(int i=0;i<c+1;i++) 19             strt[i]=""; 20         int h=0; 21         for(int i=0;i<c+1;i++) 22         { 23             for(int j=h;j<ch.length;j++) 24             { 25                 if(ch[j]==‘ ‘) 26                 { 27                     h=j+1; 28                     break; 29                 } 30                 else strt[i]+=ch[j]; 31             } 32         } 33         return strt; 34     } 35     //按k分,并去掉重复子集。 36     public String[] cut(String[] str,int k) throws MyException{ 37         if(str.length<k) 38                 throw new MyException("单词数少于"+k+",无法进行计算!"); 39         String[] t=new String[str.length-k+1]; 40         for(int i=0;i<str.length-k+1;i++) 41             t[i]=""; 42         int h=0,m=0; 43         for(;h<str.length-k+1;h++) 44         { 45             for(int i=m;i<m+k;i++) 46                 t[h]+=str[i]; 47             m++; 48         } 49         //去掉重复部分 50         int merge=0; 51         for(int i=0;i<t.length-1;i++) 52         { 53             if(t[i].equals("")) break; 54             for(int j=i+1;j<t.length;j++) 55             { 56                 if(t[i].equals(t[j])) 57                 { 58                     merge++; 59                     int y=j; 60                     for(;y<t.length-1;y++) 61                     { 62                         t[y]=t[y+1]; 63                     } 64                     t[y]=""; 65                 } 66             } 67         } 68         String[] fin=new String[t.length-merge]; 69         for(int i=0;i<t.length-merge;i++) 70             fin[i]=t[i]; 71         return fin; 
72     } 73     public class MyException extends Exception{ 74         public MyException(String str){ 75             JOptionPane.showMessageDialog(null, str,"警告", JOptionPane.INFORMATION_MESSAGE); 76         } 77     } 78     //求两字符串数组合集个数。 79     public int heji(String[] a,String[] b){ 80         int count=a.length+b.length; 81         for(int i=0;i<a.length;i++) 82         { 83             for(int j=0;j<b.length;j++) 84             { 85                 if(a[i].equals(b[j])) 86                     count--; 87             } 88         } 89         return count; 90     } 91     //求两字符串数组交集个数。 92     public int jiaoji(String[] a,String[] b){ 93         int count=0; 94         for(int i=0;i<a.length;i++) 95         { 96             for(int j=0;j<b.length;j++) 97             { 98                 if(a[i].equals(b[j])) 99                     count++;100             }101         }102         return count;103     }104 105 }

 

 1 package bigproject2; 2  3  4 public class KShinglingAlgorithm extends union{ 5     private String text1,text2; 6     public String getText1() 7     { 8         return text1; 9     }10     public String getText2()11     {12         return text2;13     }14     public void setText1(String text1)15     {16         this.text1=text1;17     }18     public void setText2(String text2)19     {20         this.text2=text2;21     }22     23     public float getSimilarity(int k)24     {25        union a=new union();26        String[] t1=a.ziji(this.text1);27        String[] t2=a.ziji(this.text2);28        String[] t1t,t2t;29        try{30            t1t=a.cut(t1, k);31            t2t=a.cut(t2, k);32            33        }catch(MyException e){34                return -1;35        }36        int he=a.heji(t1t, t2t);37        int jiao=a.jiaoji(t1t, t2t);38        return (float)jiao/he;39     }40 41 }

 

 

面板设计部分:

技术分享
  1 package bigproject2;  2 import java.awt.*;  3 import java.awt.event.*;  4 import java.io.BufferedReader;  5 import java.io.File;  6 import java.io.FileNotFoundException;  7 import java.io.FileReader;  8 import java.io.IOException;  9 import java.io.InputStreamReader; 10  11 import javax.swing.*; 12 import javax.swing.event.*; 13 import javax.swing.filechooser.FileNameExtensionFilter; 14  15 public class Outlook extends JFrame{ 16     JFrame frm=new JFrame("相似度计算器"); 17     JPanel areabottom=new JPanel(); 18     JPanel areatop=new JPanel(); 19     JPanel areamiddle=new JPanel(); 20     static JTextArea tl=new JTextArea(); 21     static JTextArea tr=new JTextArea(); 22     JScrollPane left=new JScrollPane(tl,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, 23             JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED); 24     JScrollPane right=new JScrollPane(tr,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, 25             JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED); 26     JSplitPane sp=new JSplitPane(JSplitPane.HORIZONTAL_SPLIT,left,right); 27     static JButton toBig=new JButton("全部大写"); 28     static JButton delbd=new JButton("去掉标点"); 29     static JButton count=new JButton("计算相似度"); 30     JLabel space=new JLabel("                                               "); 31     JLabel t1=new JLabel("Text1"); 32     JLabel t2=new JLabel("Text2"); 33  34     JMenuBar mb=new JMenuBar(); 35     JMenu open=new JMenu("打开"); 36     JMenuItem opent1=new JMenuItem("打开到Text1"); 37     JMenuItem opent2=new JMenuItem("打开到Text2"); 38      39     private String str=""; 40     public Outlook() 41     { 42         judge(); 43          44         frm.setVisible(true); 45         frm.setBounds(50, 50, 500, 400); 46         frm.setLayout(new BorderLayout(5,5)); 47          48         frm.add("North",areatop); 49         frm.add("Center",areamiddle); 50         frm.add("South",areabottom); 51          52         areatop.add(mb); 53         mb.add(open);         54         open.add(opent1); 55         
open.add(opent2); 56         open.setPreferredSize(new Dimension(40,18)); 57         mb.setBackground(frm.getBackground()); 58         areatop.setLayout(new FlowLayout(FlowLayout.LEFT)); 59         areamiddle.setLayout(new FlowLayout(FlowLayout.LEFT)); 60          61         areamiddle.add(t1); 62         t1.setPreferredSize(new Dimension(frm.getWidth()/2-20,10)); 63         areamiddle.add(t2); 64         t2.setPreferredSize(new Dimension(50,10)); 65         areamiddle.add(left); 66         left.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2));     67         areamiddle.add(right); 68         right.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2)); 69         tl.setLineWrap(true); 70         tr.setLineWrap(true); 71          72         areabottom.add(toBig); 73         areabottom.add(delbd); 74         areabottom.add(space); 75         areabottom.add(count); 76          77         opent1.addActionListener(new ActionListener(){ 78             public void actionPerformed(ActionEvent e) { 79                 try { 80                     openfile(); 81                     tl.setText(str); 82                 } catch (IOException e1) { 83                     e1.printStackTrace(); 84                 } 85                 judge(); 86             } 87         }); 88         opent2.addActionListener(new ActionListener(){ 89             public void actionPerformed(ActionEvent e) { 90                 try { 91                     openfile(); 92                     tr.setText(str); 93                 } catch (IOException e1) { 94                     e1.printStackTrace(); 95                 } 96                 judge(); 97             } 98         }); 99         toBig.addActionListener(new ActionListener(){100             public void actionPerformed(ActionEvent e){101                 tl.setText(tobig(tl.getText()));102                 tr.setText(tobig(tr.getText()));103             }104         });105         106         
delbd.addActionListener(new ActionListener(){107             public void actionPerformed(ActionEvent e){108                 tl.setText(del(tl.getText()));109                 tr.setText(del(tr.getText()));110                 judge();111             }112             113         });114         count.addActionListener(new ActionListener(){115             public void actionPerformed(ActionEvent e){116                 KShinglingAlgorithm a=new KShinglingAlgorithm();117                 a.setText1(tl.getText());118                 a.setText2(tr.getText());119                 float b=a.getSimilarity(4);120                 if(b!=-1)121                     JOptionPane.showMessageDialog(null, Float.toString(b),"相似度", JOptionPane.INFORMATION_MESSAGE); 122             }123         });124         tr.addKeyListener(new KeyAdapter(){125             public void keyTyped(KeyEvent e){126                 judge();127             }128         });129         tl.addKeyListener(new KeyAdapter(){130             public void keyTyped(KeyEvent e){131                 judge();132             }133         });134     }135     public void judge(){136         if(tl.getText().length()!=0||tr.getText().length()!=0) {137             toBig.setEnabled(true);138             delbd.setEnabled(true);139             count.setEnabled(true);140         }141         else{142             toBig.setEnabled(false);143             delbd.setEnabled(false);144             count.setEnabled(false);145         }    146     }147     public void openfile() throws IOException{148         str="";149         JFileChooser choose=new JFileChooser();        150         int result = choose.showOpenDialog(this);151         File file = null; //注意初始化152         //加过滤器153         if (result == JFileChooser.APPROVE_OPTION) {154             file = choose.getSelectedFile();155             }156         else{157             return; //使点取消后不会抛出异常158         }159         FileReader fr=new FileReader(file);160         BufferedReader br=new 
BufferedReader(fr);161         char c[]=new char[512];162         String strline="";163         while(br.ready()){164             strline=br.readLine();165             str+=strline;166         };167         br.close();168         fr.close();169     }170     public String tobig(String str){171         String temp="";172         for(int i=0;i<str.length();i++)173         {174             if(str.charAt(i)>=‘a‘&&str.charAt(i)<=‘z‘)175             {176                 char t=str.charAt(i);177                 t=(char)(str.charAt(i)-32);178                 temp+=t;179             }180             else temp+=str.charAt(i);181         }182         return temp;183     }184     185     public String del(String str){186         String temp="";187         for(int i=0;i<str.length();i++)188         {189             char t=str.charAt(i);190             if(t>=‘!‘&&t<=‘/‘||t>=58&&t<=64||t>=91&&t<=96||t>=123&&t<=126);191             else temp+=t;192         }193         return temp;194     }195     public static void main(String[] args){196         new Outlook();197         198         199     }200 }
Outlook

 

java大作业 KShinglingAlgorithm