หลักการ Search Engine เครดิต http://www.narisa.com/forums/index.php?showtopic=29167 ไฟล์ ReadTxt.java
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.swing.JOptionPane;

/**
 * Loads every {@code .txt} document found directly inside the {@code doc}
 * directory (resolved against the working directory).
 *
 * <p>After construction, {@link #getFileName()} holds the document names
 * without their extension and {@link #getTxtFile()} holds the matching text at
 * the same index. Both arrays always have the same length and never contain
 * {@code null}: a document that cannot be read is stored as an empty string.
 */
public class ReadTxt {

    /** Directory that holds the searchable documents. */
    private static final String DOC_DIR = "doc";

    /** Extension (matched case-insensitively) that marks a document. */
    private static final String TXT_SUFFIX = ".txt";

    /** Charset used to decode the documents; change here if the corpus uses another encoding. */
    private static final String CHARSET = "UTF-8";

    /** Document names without extension, index-aligned with {@link #txtFile}. */
    String[] fileName;

    /** The files behind {@link #fileName}, kept so the real on-disk name is the one opened. */
    private File[] docFiles = new File[0];

    /**
     * {@code count} is the number of documents found by the last scan,
     * {@code chCount} the number of documents whose text is blank.
     * {@code numCount} is not used by this class.
     */
    int count = 0, chCount = 0, numCount = 0;

    /** Text of each document, lines joined by a single space. */
    String[] txtFile;

    /** Scans the document directory and reads every document it finds. */
    ReadTxt() {
        SearchDoc();
        txtFile = new String[docFiles.length];
        for (int i = 0; i < docFiles.length; i++) {
            String text = readDocument(docFiles[i]);
            if (text.trim().length() == 0) {
                chCount++;
            }
            txtFile[i] = text;
        }
    }

    /** @return the document names, without directory or extension */
    public String[] getFileName() {
        return fileName;
    }

    /** @return the number of documents found by the last call to {@link #SearchDoc()} */
    public int getCount() {
        return count;
    }

    /** @return the text of each document, index-aligned with {@link #getFileName()} */
    public String[] getTxtFile() {
        return txtFile;
    }

    /** @param txtFile replacement document texts */
    public void setTxtFile(String[] txtFile) {
        this.txtFile = txtFile;
    }

    /**
     * Lists the regular files in the document directory whose name ends with
     * {@code .txt} (any letter case), stores their base names in
     * {@link #fileName}, prints each name to standard output and sets
     * {@link #count} to the number found.
     *
     * <p>A missing or unreadable directory yields an empty result instead of a
     * {@code NullPointerException}. Entries are sorted by path because
     * {@link File#listFiles()} does not guarantee any order.
     */
    public void SearchDoc() {
        File[] entries = new File(DOC_DIR).listFiles();
        List<File> found = new ArrayList<File>();
        if (entries == null) {
            System.err.println("Document directory not found or not readable: " + DOC_DIR);
        } else {
            Arrays.sort(entries);
            for (File entry : entries) {
                if (entry.isFile() && hasTxtSuffix(entry.getName())) {
                    found.add(entry);
                }
            }
        }

        docFiles = found.toArray(new File[found.size()]);
        fileName = new String[docFiles.length];
        for (int i = 0; i < docFiles.length; i++) {
            String name = docFiles[i].getName();
            fileName[i] = name.substring(0, name.length() - TXT_SUFFIX.length());
            System.out.println(fileName[i]);
        }
        count = fileName.length;
    }

    /** Tells whether {@code name} ends with the document extension, ignoring case. */
    private static boolean hasTxtSuffix(String name) {
        int start = name.length() - TXT_SUFFIX.length();
        return start >= 0 && name.regionMatches(true, start, TXT_SUFFIX, 0, TXT_SUFFIX.length());
    }

    /**
     * Reads a whole document, joining its non-empty lines with one space so
     * that the last word of a line is not fused with the first word of the next.
     *
     * @return the document text, or an empty string when the file cannot be read
     */
    private static String readDocument(File file) {
        StringBuilder text = new StringBuilder();
        FileInputStream in = null;
        try {
            in = new FileInputStream(file);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }
                if (text.length() > 0) {
                    text.append(' ');
                }
                text.append(line);
            }
            return text.toString();
        } catch (IOException e) {
            System.err.println("Cannot read " + file.getPath() + ": " + e.getMessage());
            return "";
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done when closing a read-only stream fails.
                }
            }
        }
    }
}
ไฟล์ SearchTest.java
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import javax.swing.JOptionPane;
import jxl.Workbook;
import jxl.WorkbookSettings;
import
jxl.format.Alignment;
import java.text.DecimalFormat;
import java.util.LinkedHashSet;
import jxl.format.BorderLineStyle;
import jxl.format.Colour;
import jxl.format.VerticalAlignment;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableFont;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;

/**
 * Asks for a search phrase, counts how often each distinct term occurs in the
 * documents loaded by {@code ReadTxt}, prints tf / df / idf / weight figures to
 * standard output and writes the same figures to {@code search.xls}.
 *
 * <p>For a term {@code t} and document {@code d}:
 * {@code idf(t) = log10(N / df(t))} (0 when no document contains the term) and
 * {@code weight(t, d) = count(t, d) * idf(t)}, where {@code N} is the number of
 * documents. A term's score is the sum of its weights over all documents.
 */
class SearchTest {

    /** Name of the spreadsheet report, created in the working directory. */
    private static final String REPORT_FILE = "search.xls";

    /** Row of the merged column titles; the row below it holds the per-document "tf"/"wf" titles. */
    private static final int HEADER_ROW = 3;
    private static final int SUB_HEADER_ROW = 4;

    /** First row that holds one line per search term. */
    private static final int FIRST_DATA_ROW = 5;

    /** First column of the per-document (tf, wf) column pairs; columns 0-3 are term, tf, df, idf. */
    private static final int FIRST_DOC_COL = 4;

    private static final String RULE = "**********************************************************";

    /**
     * Runs one interactive search and writes the report.
     *
     * <p>Nothing is written when the dialog is cancelled or the input is blank.
     *
     * @throws IOException    when the report file cannot be written
     * @throws WriteException when the spreadsheet library rejects a cell or format
     */
    SearchTest() throws IOException, WriteException {
        ReadTxt rd = new ReadTxt();
        String[] docNames = rd.getFileName();
        String[] docTexts = rd.getTxtFile();
        int docCount = docTexts.length;
        DecimalFormat formatter = new DecimalFormat("0.00");

        System.out.println("Enter Search : ");
        String search = JOptionPane.showInputDialog(null, "Enter Search");
        // showInputDialog returns null when the user cancels.
        if (search == null || search.trim().length() == 0) {
            System.out.println("No search terms entered.");
            return;
        }
        String[] getSearch = uniqueTerms(search);

        // Occurrences of every term in every document, plus collection-wide tf and df.
        int[][] numWordPerDoc = new int[docCount][getSearch.length];
        int[] tf = new int[getSearch.length];
        int[] numSearch = new int[getSearch.length];
        for (int doc = 0; doc < docCount; doc++) {
            Map<String, Integer> counts = countWords(docTexts[doc]);
            for (int i = 0; i < getSearch.length; i++) {
                Integer seen = counts.get(getSearch[i]);
                int hits = (seen == null) ? 0 : seen.intValue();
                numWordPerDoc[doc][i] = hits;
                if (hits > 0) {
                    numSearch[i]++;
                    tf[i] += hits;
                }
                System.out.println(docNames[doc] + ".txt > Have Term " + getSearch[i]
                        + " is = " + hits);
            }
        }

        System.out.println(RULE);
        double[] valueIdf = new double[getSearch.length];
        double[][] wtq = new double[getSearch.length][docCount];
        double[] sumWeight = new double[getSearch.length];
        double netWeight = 0.0;
        for (int i = 0; i < getSearch.length; i++) {
            System.out.println("Term " + getSearch[i]);
            System.out.println("tf = " + tf[i]);
            System.out.println(getSearch[i] + " Have DF = " + numSearch[i]);

            // The ratio must be computed in floating point: integer division
            // would truncate it before the logarithm is taken.
            double idf = (numSearch[i] == 0)
                    ? 0.0
                    : Math.log10((double) docCount / numSearch[i]);
            valueIdf[i] = idf;

            double sum = 0.0;
            for (int doc = 0; doc < docCount; doc++) {
                wtq[i][doc] = numWordPerDoc[doc][i] * idf;
                sum += wtq[i][doc];
                System.out.println(docNames[doc] + ".TXT " + getSearch[i] + " have wt,q = "
                        + formatter.format(wtq[i][doc]));
            }
            sumWeight[i] = sum;
            netWeight += sum;

            System.out.println("idf " + getSearch[i] + " = " + formatter.format(idf));
            System.out.println("Score " + getSearch[i] + " = " + formatter.format(sum));
            System.out.println(RULE);
        }
        System.out.println("Sum Score = " + formatter.format(netWeight));

        writeReport(docNames, getSearch, numWordPerDoc, tf, numSearch, valueIdf, wtq,
                sumWeight, netWeight, formatter);
        // Open the report only after it has been completely written and closed.
        openReport();
        System.out.println("Search Complete!!!");
    }

    /**
     * Splits the query on whitespace and returns its distinct terms, lower-cased
     * (documents are compared in lower case too) and in the order first typed.
     */
    private static String[] uniqueTerms(String search) {
        Set<String> terms = new LinkedHashSet<String>();
        for (String term : search.trim().toLowerCase(Locale.ROOT).split("\\s+")) {
            if (term.length() > 0) {
                terms.add(term);
            }
        }
        return terms.toArray(new String[terms.size()]);
    }

    /** Counts the lower-cased, whitespace-separated words of {@code text}; null counts as empty. */
    private static Map<String, Integer> countWords(String text) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        if (text == null) {
            return counts;
        }
        for (String word : text.toLowerCase(Locale.ROOT).split("\\s+")) {
            if (word.length() == 0) {
                continue;
            }
            Integer seen = counts.get(word);
            counts.put(word, (seen == null) ? 1 : seen.intValue() + 1);
        }
        return counts;
    }

    /** Builds a centred, thin-bordered cell format; {@code font} may be null for the default font. */
    private static WritableCellFormat cellFormat(WritableFont font, Colour background)
            throws WriteException {
        WritableCellFormat format =
                (font == null) ? new WritableCellFormat() : new WritableCellFormat(font);
        format.setBackground(background);
        format.setAlignment(Alignment.CENTRE);
        format.setVerticalAlignment(VerticalAlignment.CENTRE);
        format.setBorder(jxl.format.Border.ALL, BorderLineStyle.THIN);
        return format;
    }

    /**
     * Writes the report workbook: one row per term with its tf, df and idf,
     * a (tf, wf) column pair per document, a score column and a total row.
     * The workbook is closed even when filling or writing it fails.
     */
    private static void writeReport(String[] docNames, String[] terms, int[][] termCount,
            int[] tf, int[] df, double[] idf, double[][] weight, double[] score,
            double netWeight, DecimalFormat formatter) throws IOException, WriteException {
        WorkbookSettings wbSettings = new WorkbookSettings();
        wbSettings.setUseTemporaryFileDuringWrite(true);
        wbSettings.setLocale(new Locale("en", "EN"));

        WritableWorkbook workbook = Workbook.createWorkbook(new File(REPORT_FILE), wbSettings);
        try {
            workbook.createSheet("Search", 0);
            WritableSheet sheet = workbook.getSheet(0);

            WritableFont boldFont = new WritableFont(WritableFont.ARIAL, 12, WritableFont.BOLD);
            WritableCellFormat titleFormat = cellFormat(boldFont, Colour.LIGHT_GREEN);
            WritableCellFormat valueFormat = cellFormat(null, Colour.LIGHT_GREEN);
            WritableCellFormat tfTitleFormat = cellFormat(null, Colour.SKY_BLUE);
            WritableCellFormat wfTitleFormat = cellFormat(null, Colour.ICE_BLUE);

            // Fixed columns 0-3, each title spanning both header rows.
            String[] fixedTitles = {"term", "tf", "df", "idf"};
            for (int col = 0; col < fixedTitles.length; col++) {
                sheet.mergeCells(col, HEADER_ROW, col, SUB_HEADER_ROW);
                sheet.addCell(new Label(col, HEADER_ROW, fixedTitles[col], titleFormat));
            }

            // One (tf, wf) column pair per document under a merged document title.
            int docCount = docNames.length;
            for (int doc = 0; doc < docCount; doc++) {
                int tfCol = FIRST_DOC_COL + doc * 2;
                int wfCol = tfCol + 1;
                sheet.mergeCells(tfCol, HEADER_ROW, wfCol, HEADER_ROW);
                sheet.addCell(new Label(tfCol, HEADER_ROW, "TXT " + docNames[doc], titleFormat));
                sheet.addCell(new Label(tfCol, SUB_HEADER_ROW, "tf", tfTitleFormat));
                sheet.addCell(new Label(wfCol, SUB_HEADER_ROW, "wf", wfTitleFormat));
            }

            int scoreCol = FIRST_DOC_COL + docCount * 2;
            sheet.mergeCells(scoreCol, HEADER_ROW, scoreCol, SUB_HEADER_ROW);
            sheet.addCell(new Label(scoreCol, HEADER_ROW, "score", titleFormat));

            for (int i = 0; i < terms.length; i++) {
                int row = FIRST_DATA_ROW + i;
                sheet.addCell(new Label(0, row, terms[i], valueFormat));
                sheet.addCell(new Label(1, row, String.valueOf(tf[i]), valueFormat));
                sheet.addCell(new Label(2, row, String.valueOf(df[i]), valueFormat));
                sheet.addCell(new Label(3, row, formatter.format(idf[i]), valueFormat));
                for (int doc = 0; doc < docCount; doc++) {
                    int tfCol = FIRST_DOC_COL + doc * 2;
                    sheet.addCell(new Label(tfCol, row, "" + termCount[doc][i], valueFormat));
                    sheet.addCell(new Label(tfCol + 1, row,
                            formatter.format(weight[i][doc]), valueFormat));
                }
                sheet.addCell(new Label(scoreCol, row, formatter.format(score[i]), titleFormat));
            }

            int totalRow = FIRST_DATA_ROW + terms.length;
            sheet.addCell(new Label(scoreCol - 1, totalRow, "Sum Score", valueFormat));
            sheet.addCell(new Label(scoreCol, totalRow, formatter.format(netWeight), titleFormat));

            workbook.write();
        } finally {
            workbook.close();
        }
    }

    /**
     * Best-effort attempt to open the finished report with the Windows file
     * handler. Failure (for example on another operating system) is reported
     * but does not abort the search.
     */
    private static void openReport() {
        try {
            Runtime.getRuntime().exec(
                    new String[] {"rundll32", "url.dll,FileProtocolHandler", REPORT_FILE});
        } catch (IOException e1) {
            System.err.println("Could not open " + REPORT_FILE + ": " + e1.getMessage());
        }
    }

    public static void main(String[] args) throws WriteException, IOException {
        new SearchTest();
    }
}
โดยเราจะเก็บไฟล์ Text ไว้ที่ Floder "doc" ในโปรเจค /////////////////////////////////////// ////////////////////////////////////// ตัวอย่างผลลัพธ์ รายงานออกมาเป็น Excel ต้องมี jxl.jar ดาวโหลดได้ที่ http://www.findjar.com/jar/net.sourceforge.jexcelapi/jars/jxl-2.6.jar.html
สำหรับศึกษาเกี่ยวกับภาษา Java และกันลืมของกระผมเอง เพราะการทำโปรแกรมนั้นส่วนใหญ่ก็ไม่สามารถจำคำสั่งต่างๆ ได้หมด และเพื่อเป็นความรู้กับบุคคลทั่วไป ใครมีโจทย์ปัญหาอะไรที่คิดไม่ออกก็โพสต์ลงไว้ได้นะครับ จะเช็กดูทุกวัน
29 มี.ค. 2554
การทำ Search Engine จากไฟล์ Text ง่ายๆ
สมัครสมาชิก:
ส่งความคิดเห็น (Atom)
ไม่มีความคิดเห็น:
แสดงความคิดเห็น