首页 > 代码库 > java大作业 KShinglingAlgorithm
java大作业 KShinglingAlgorithm
wiki上关于KShingling Algorithm(w-shingling)的说明:
http://en.wikipedia.org/wiki/W-shingling
摘要:
In natural language processing a w-shingling is a set of unique "shingles"—contiguous subsequences of tokens in a document—that can be used to gauge the similarity of two documents. The w denotes the number of tokens in each shingle in the set.
The document, "a rose is a rose is a rose" can be tokenized as follows:
- (a,rose,is,a,rose,is,a,rose)
The set of all contiguous sequences of 4 tokens (N-grams, here: 4-grams) is
- { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is), (a,rose,is,a), (rose,is,a,rose) } = { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is) }
我对此算法的理解是:把每段文本都按上述方式分解成 shingle 集合后,先统计两个集合的并集大小 b,再统计交集大小 a,用 a/b 得到相似度(即 Jaccard 相似系数)。
写得有些复杂:
import java.awt.GraphicsEnvironment;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

import javax.swing.JOptionPane;

/**
 * Building blocks of the k-shingling (w-shingling) similarity measure:
 * tokenizing a text, turning the tokens into a set of k-word shingles, and
 * counting the union and intersection of two shingle sets.
 */
public class union {

    /**
     * Placed between the words of one shingle so that different word
     * sequences can never produce the same shingle string, e.g.
     * ("ab", "c") versus ("a", "bc"). Words produced by {@link #ziji}
     * never contain whitespace, so a space is unambiguous for them.
     */
    private static final String WORD_SEPARATOR = " ";

    /**
     * Splits a text into its words.
     *
     * <p>Words are separated by any run of whitespace (spaces, tabs, line
     * breaks); leading and trailing whitespace is ignored, so no empty
     * words are ever returned.
     *
     * @param str the text to tokenize; {@code null} is treated as empty
     * @return the words in order of appearance, or an empty array when the
     *         text contains no words
     */
    public String[] ziji(String str) {
        if (str == null) {
            return new String[0];
        }
        String trimmed = str.trim();
        if (trimmed.isEmpty()) {
            return new String[0];
        }
        return trimmed.split("\\s+");
    }

    /**
     * Builds the set of distinct shingles of {@code k} consecutive words.
     *
     * @param str the words of the document, in order
     * @param k   number of words per shingle; must be positive
     * @return the distinct shingles, in order of first occurrence
     * @throws MyException              if there are fewer than {@code k} words
     * @throws IllegalArgumentException if {@code k} is not positive
     */
    public String[] cut(String[] str, int k) throws MyException {
        if (k <= 0) {
            throw new IllegalArgumentException("k must be positive: " + k);
        }
        if (str.length < k) {
            throw new MyException("单词数少于" + k + ",无法进行计算!");
        }
        // A LinkedHashSet removes duplicates while keeping first-occurrence order.
        Set<String> shingles = new LinkedHashSet<String>();
        for (int start = 0; start + k <= str.length; start++) {
            StringBuilder shingle = new StringBuilder();
            for (int i = start; i < start + k; i++) {
                if (i > start) {
                    shingle.append(WORD_SEPARATOR);
                }
                shingle.append(str[i]);
            }
            shingles.add(shingle.toString());
        }
        return shingles.toArray(new String[shingles.size()]);
    }

    /**
     * Signals that a text has too few words to form a single shingle.
     *
     * <p>Constructing the exception also shows a warning dialog with the
     * message, unless the JVM is running headless, in which case the dialog
     * is skipped so that the exception itself can still be thrown.
     */
    public class MyException extends Exception {
        /**
         * @param str the message to report (also available via {@link #getMessage()})
         */
        public MyException(String str) {
            super(str);
            if (!GraphicsEnvironment.isHeadless()) {
                JOptionPane.showMessageDialog(null, str, "警告", JOptionPane.INFORMATION_MESSAGE);
            }
        }
    }

    /**
     * Counts the distinct strings that occur in {@code a}, in {@code b}, or in
     * both (the size of the set union). Duplicates within an array count once.
     *
     * @param a first collection of strings
     * @param b second collection of strings
     * @return the number of distinct strings across both arrays
     */
    public int heji(String[] a, String[] b) {
        Set<String> all = new HashSet<String>(Arrays.asList(a));
        all.addAll(Arrays.asList(b));
        return all.size();
    }

    /**
     * Counts the distinct strings that occur in both {@code a} and {@code b}
     * (the size of the set intersection). Duplicates within an array count once.
     *
     * @param a first collection of strings
     * @param b second collection of strings
     * @return the number of distinct strings common to both arrays
     */
    public int jiaoji(String[] a, String[] b) {
        Set<String> common = new HashSet<String>(Arrays.asList(a));
        common.retainAll(new HashSet<String>(Arrays.asList(b)));
        return common.size();
    }
}
1 package bigproject2; 2 3 4 public class KShinglingAlgorithm extends union{ 5 private String text1,text2; 6 public String getText1() 7 { 8 return text1; 9 }10 public String getText2()11 {12 return text2;13 }14 public void setText1(String text1)15 {16 this.text1=text1;17 }18 public void setText2(String text2)19 {20 this.text2=text2;21 }22 23 public float getSimilarity(int k)24 {25 union a=new union();26 String[] t1=a.ziji(this.text1);27 String[] t2=a.ziji(this.text2);28 String[] t1t,t2t;29 try{30 t1t=a.cut(t1, k);31 t2t=a.cut(t2, k);32 33 }catch(MyException e){34 return -1;35 }36 int he=a.heji(t1t, t2t);37 int jiao=a.jiaoji(t1t, t2t);38 return (float)jiao/he;39 }40 41 }
面板设计部分:
1 package bigproject2; 2 import java.awt.*; 3 import java.awt.event.*; 4 import java.io.BufferedReader; 5 import java.io.File; 6 import java.io.FileNotFoundException; 7 import java.io.FileReader; 8 import java.io.IOException; 9 import java.io.InputStreamReader; 10 11 import javax.swing.*; 12 import javax.swing.event.*; 13 import javax.swing.filechooser.FileNameExtensionFilter; 14 15 public class Outlook extends JFrame{ 16 JFrame frm=new JFrame("相似度计算器"); 17 JPanel areabottom=new JPanel(); 18 JPanel areatop=new JPanel(); 19 JPanel areamiddle=new JPanel(); 20 static JTextArea tl=new JTextArea(); 21 static JTextArea tr=new JTextArea(); 22 JScrollPane left=new JScrollPane(tl,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, 23 JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED); 24 JScrollPane right=new JScrollPane(tr,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS, 25 JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED); 26 JSplitPane sp=new JSplitPane(JSplitPane.HORIZONTAL_SPLIT,left,right); 27 static JButton toBig=new JButton("全部大写"); 28 static JButton delbd=new JButton("去掉标点"); 29 static JButton count=new JButton("计算相似度"); 30 JLabel space=new JLabel(" "); 31 JLabel t1=new JLabel("Text1"); 32 JLabel t2=new JLabel("Text2"); 33 34 JMenuBar mb=new JMenuBar(); 35 JMenu open=new JMenu("打开"); 36 JMenuItem opent1=new JMenuItem("打开到Text1"); 37 JMenuItem opent2=new JMenuItem("打开到Text2"); 38 39 private String str=""; 40 public Outlook() 41 { 42 judge(); 43 44 frm.setVisible(true); 45 frm.setBounds(50, 50, 500, 400); 46 frm.setLayout(new BorderLayout(5,5)); 47 48 frm.add("North",areatop); 49 frm.add("Center",areamiddle); 50 frm.add("South",areabottom); 51 52 areatop.add(mb); 53 mb.add(open); 54 open.add(opent1); 55 open.add(opent2); 56 open.setPreferredSize(new Dimension(40,18)); 57 mb.setBackground(frm.getBackground()); 58 areatop.setLayout(new FlowLayout(FlowLayout.LEFT)); 59 areamiddle.setLayout(new FlowLayout(FlowLayout.LEFT)); 60 61 areamiddle.add(t1); 62 t1.setPreferredSize(new 
Dimension(frm.getWidth()/2-20,10)); 63 areamiddle.add(t2); 64 t2.setPreferredSize(new Dimension(50,10)); 65 areamiddle.add(left); 66 left.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2)); 67 areamiddle.add(right); 68 right.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2)); 69 tl.setLineWrap(true); 70 tr.setLineWrap(true); 71 72 areabottom.add(toBig); 73 areabottom.add(delbd); 74 areabottom.add(space); 75 areabottom.add(count); 76 77 opent1.addActionListener(new ActionListener(){ 78 public void actionPerformed(ActionEvent e) { 79 try { 80 openfile(); 81 tl.setText(str); 82 } catch (IOException e1) { 83 e1.printStackTrace(); 84 } 85 judge(); 86 } 87 }); 88 opent2.addActionListener(new ActionListener(){ 89 public void actionPerformed(ActionEvent e) { 90 try { 91 openfile(); 92 tr.setText(str); 93 } catch (IOException e1) { 94 e1.printStackTrace(); 95 } 96 judge(); 97 } 98 }); 99 toBig.addActionListener(new ActionListener(){100 public void actionPerformed(ActionEvent e){101 tl.setText(tobig(tl.getText()));102 tr.setText(tobig(tr.getText()));103 }104 });105 106 delbd.addActionListener(new ActionListener(){107 public void actionPerformed(ActionEvent e){108 tl.setText(del(tl.getText()));109 tr.setText(del(tr.getText()));110 judge();111 }112 113 });114 count.addActionListener(new ActionListener(){115 public void actionPerformed(ActionEvent e){116 KShinglingAlgorithm a=new KShinglingAlgorithm();117 a.setText1(tl.getText());118 a.setText2(tr.getText());119 float b=a.getSimilarity(4);120 if(b!=-1)121 JOptionPane.showMessageDialog(null, Float.toString(b),"相似度", JOptionPane.INFORMATION_MESSAGE); 122 }123 });124 tr.addKeyListener(new KeyAdapter(){125 public void keyTyped(KeyEvent e){126 judge();127 }128 });129 tl.addKeyListener(new KeyAdapter(){130 public void keyTyped(KeyEvent e){131 judge();132 }133 });134 }135 public void judge(){136 if(tl.getText().length()!=0||tr.getText().length()!=0) {137 toBig.setEnabled(true);138 
delbd.setEnabled(true);139 count.setEnabled(true);140 }141 else{142 toBig.setEnabled(false);143 delbd.setEnabled(false);144 count.setEnabled(false);145 } 146 }147 public void openfile() throws IOException{148 str="";149 JFileChooser choose=new JFileChooser(); 150 int result = choose.showOpenDialog(this);151 File file = null; //注意初始化152 //加过滤器153 if (result == JFileChooser.APPROVE_OPTION) {154 file = choose.getSelectedFile();155 }156 else{157 return; //使点取消后不会抛出异常158 }159 FileReader fr=new FileReader(file);160 BufferedReader br=new BufferedReader(fr);161 char c[]=new char[512];162 String strline="";163 while(br.ready()){164 strline=br.readLine();165 str+=strline;166 };167 br.close();168 fr.close();169 }170 public String tobig(String str){171 String temp="";172 for(int i=0;i<str.length();i++)173 {174 if(str.charAt(i)>=‘a‘&&str.charAt(i)<=‘z‘)175 {176 char t=str.charAt(i);177 t=(char)(str.charAt(i)-32);178 temp+=t;179 }180 else temp+=str.charAt(i);181 }182 return temp;183 }184 185 public String del(String str){186 String temp="";187 for(int i=0;i<str.length();i++)188 {189 char t=str.charAt(i);190 if(t>=‘!‘&&t<=‘/‘||t>=58&&t<=64||t>=91&&t<=96||t>=123&&t<=126);191 else temp+=t;192 }193 return temp;194 }195 public static void main(String[] args){196 new Outlook();197 198 199 }200 }
java大作业 KShinglingAlgorithm
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可进行投诉:投诉/举报。工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。