29 มี.ค. 2554

การทำ Search Engine จากไฟล์ Text ง่ายๆ

หลักการ Search Engine เครดิต  http://www.narisa.com/forums/index.php?showtopic=29167

ไฟล์ ReadTxt.java

import java.io.BufferedInputStream;

import java.io.DataInputStream;

import java.io.DataOutputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import javax.swing.JOptionPane;

/**
 * Loads every {@code .txt} file found directly inside the {@code doc} directory
 * (resolved against the current working directory) into memory.
 *
 * <p>After construction, {@link #getFileName()} and {@link #getTxtFile()} are
 * parallel arrays: entry {@code i} of the first is a file's base name (without
 * directory or extension) and entry {@code i} of the second is that file's text
 * with every line break replaced by a single space, so that callers splitting
 * on spaces never see two words from adjacent lines glued together.
 */
public class ReadTxt {

    /** Base names (no directory, no {@code .txt} extension) of the text files found. */
    String[] fileName;

    /**
     * Returns the base names of the text files found in {@code doc}.
     *
     * @return one entry per text file, parallel to {@link #getTxtFile()}
     */
    public String[] getFileName() {
        return fileName;
    }

    /*
     * count    - number of text files found by the last SearchDoc() call.
     * chCount  - number of loaded files whose content is blank.
     * numCount - not used by this class; kept so existing references still compile.
     */
    int count = 0,chCount=0,numCount=0;

    /**
     * Returns how many text files the last {@link #SearchDoc()} call found.
     *
     * @return the number of text files
     */
    public int getCount() {
        return count;
    }

    /** Text of each file, parallel to {@link #fileName}; never contains {@code null} after construction. */
    String[] txtFile;

    /** The files behind {@link #fileName}, kept so they are opened under their real on-disk names. */
    private File[] docFiles = new File[0];

    /**
     * Scans the {@code doc} directory and reads every text file in it.
     * A file that cannot be read is reported on {@code System.err} and
     * contributes an empty string instead of a {@code null} entry.
     */
    ReadTxt(){
        SearchDoc();
        txtFile = new String[fileName.length];
        for (int i = 0; i < fileName.length; i++) {
            String text = readText(docFiles[i]);
            if (text == null) {
                // Unreadable file: keep the arrays aligned and null-free.
                txtFile[i] = "";
                continue;
            }
            if (text.trim().equals("")) {
                chCount++;
            }
            txtFile[i] = text;
        }
    }

    /**
     * Returns the loaded text of each file.
     *
     * @return one entry per text file, parallel to {@link #getFileName()}
     */
    public String[] getTxtFile() {
        return txtFile;
    }

    /**
     * Replaces the loaded texts.
     *
     * @param txtFile the new texts; stored as given, without copying
     */
    public void setTxtFile(String[] txtFile) {
        this.txtFile = txtFile;
    }

    /**
     * Lists the regular files in {@code doc} whose name ends with {@code .txt}
     * (case-insensitive), stores their base names in {@link #fileName}, updates
     * {@link #count} and prints each base name to standard output.
     *
     * <p>A missing or unreadable directory yields an empty result rather than a
     * {@code NullPointerException}. Entries are sorted so the order does not
     * depend on the file system.
     */
    public void SearchDoc(){
        count = 0;
        File[] entries = new File("doc").listFiles();
        if (entries == null) {
            // listFiles() returns null when "doc" is missing or is not a directory.
            entries = new File[0];
        }
        java.util.Arrays.sort(entries);

        final String extension = ".txt";
        File[] matches = new File[entries.length];
        for (int j = 0; j < entries.length; j++) {
            String name = entries[j].getName();
            boolean isText = name.regionMatches(true, name.length() - extension.length(),
                    extension, 0, extension.length());
            if (entries[j].isFile() && isText) {
                matches[count++] = entries[j];
            }
        }

        docFiles = new File[count];
        fileName = new String[count];
        for (int j = 0; j < count; j++) {
            String name = matches[j].getName();
            docFiles[j] = matches[j];
            fileName[j] = name.substring(0, name.length() - extension.length());
            System.out.println(fileName[j]);
        }
    }

    /**
     * Reads a whole file as UTF-8 text and turns each line break into one space.
     *
     * @param file the file to read
     * @return the file's text, or {@code null} if it could not be read
     */
    private static String readText(File file) {
        DataInputStream in = null;
        try {
            in = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
            byte[] raw = new byte[(int) file.length()];
            in.readFully(raw);
            String text = new String(raw, "UTF-8");
            if (text.length() > 0 && text.charAt(0) == '\uFEFF') {
                // Drop a UTF-8 byte-order mark so it does not stick to the first word.
                text = text.substring(1);
            }
            return text.replaceAll("\\r\\n|\\r|\\n", " ");
        } catch (IOException e) {
            System.err.println("Cannot read " + file.getPath() + ": " + e);
            return null;
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

}

ไฟล์ SearchTest.java

import java.io.File;

import java.io.IOException;

import java.util.List;

import java.util.Arrays;

import java.util.HashMap;

import java.util.HashSet;

import java.util.Locale;

import java.util.Map;

import java.util.Scanner;

import java.util.Set;

import javax.swing.JOptionPane;

import jxl.Workbook;

import jxl.WorkbookSettings;

import jxl.format.Alignment;

import jxl.format.BorderLineStyle;

import jxl.format.Colour;

import jxl.format.VerticalAlignment;

import jxl.write.Label;

import jxl.write.WritableCellFormat;

import jxl.write.WritableFont;

import jxl.write.WritableSheet;

import jxl.write.WritableWorkbook;

import jxl.write.WriteException;

/**
 * Small TF-IDF search demo over the text files loaded by {@link ReadTxt}.
 *
 * <p>Constructing an instance loads the documents, asks for a query in a
 * dialog, prints per-term statistics to standard output, writes the same
 * figures to {@code search.xls} and then tries to open that file.
 *
 * <p>For each distinct query term the report contains:
 * <ul>
 *   <li>tf - total occurrences of the term across all documents,</li>
 *   <li>df - number of documents containing the term,</li>
 *   <li>idf - {@code log10(documentCount / df)}, or 0 when df is 0,</li>
 *   <li>per document: the term count ("tf") and count * idf ("wf"),</li>
 *   <li>score - the sum of the term's per-document weights.</li>
 * </ul>
 */
class SearchTest{

    /** Name of the generated spreadsheet, relative to the working directory. */
    private static final String REPORT_FILE = "search.xls";

    /** Spreadsheet row of the upper header line. */
    private static final int HEADER_ROW = 3;

    /** Spreadsheet row of the lower header line (the per-document "tf"/"wf" captions). */
    private static final int SUB_HEADER_ROW = 4;

    /** Spreadsheet row of the first query term. */
    private static final int FIRST_DATA_ROW = 5;

    /** First column used for per-document data; columns 0-3 hold term, tf, df and idf. */
    private static final int FIRST_DOC_COLUMN = 4;

    /** Separator line printed between console sections. */
    private static final String RULE = "**********************************************************";

    /**
     * Runs the whole demo: load documents, prompt for a query, print the
     * statistics, write the spreadsheet and open it.
     *
     * <p>If the dialog is cancelled or the query contains no words, a message
     * is printed and no report is written.
     *
     * @throws IOException if the spreadsheet cannot be created or written
     * @throws WriteException if the spreadsheet library rejects a cell or format
     */
    SearchTest() throws IOException, WriteException{
        ReadTxt rd = new ReadTxt();
        String[] docNames = rd.getFileName();
        String[] docTexts = rd.getTxtFile();
        int docCount = docTexts.length;

        System.out.println("Enter Search : ");
        String search = JOptionPane.showInputDialog(null,"Enter Search");
        String[] terms = uniqueTerms(search);
        if (terms.length == 0) {
            System.out.println("No search terms entered.");
            return;
        }

        // Count, per document, how often each query term occurs.
        int[][] countPerDoc = new int[docCount][terms.length];
        int[] df = new int[terms.length];
        int[] tf = new int[terms.length];
        for (int doc = 0; doc < docCount; doc++) {
            Map<String,Integer> wordCounts = countWords(docTexts[doc]);
            for (int i = 0; i < terms.length; i++) {
                Integer hits = wordCounts.get(terms[i]);
                if (hits != null) {
                    countPerDoc[doc][i] = hits.intValue();
                    df[i]++;
                    tf[i] += hits.intValue();
                }
                System.out.println(docNames[doc]+".txt > Have Term "+terms[i]+" is = "+countPerDoc[doc][i]);
            }
        }

        java.text.DecimalFormat formatter = new java.text.DecimalFormat("0.00");
        double[] idf = new double[terms.length];
        double[][] weight = new double[terms.length][docCount];
        double[] score = new double[terms.length];
        double netWeight = 0.00;

        System.out.println(RULE);
        for (int i = 0; i < terms.length; i++) {
            System.out.println("Term "+terms[i]);
            System.out.println("tf = "+tf[i]);
            System.out.println(terms[i]+" Have DF = "+df[i]);

            // Divide as doubles: integer division would truncate e.g. 3/2 to 1 and give idf 0.
            idf[i] = (df[i] == 0) ? 0.00 : Math.log10((double) docCount / df[i]);

            for (int doc = 0; doc < docCount; doc++) {
                weight[i][doc] = countPerDoc[doc][i] * idf[i];
                score[i] += weight[i][doc];
                System.out.println(docNames[doc]+".TXT "+terms[i]+" have wt,q = "+formatter.format(weight[i][doc]));
            }
            System.out.println("idf "+terms[i]+" = "+formatter.format(idf[i]));
            System.out.println("Score "+terms[i]+" = "+formatter.format(score[i]));
            System.out.println(RULE);
            netWeight += score[i];
        }
        System.out.println("Sum Score = "+formatter.format(netWeight));

        writeReport(docNames, terms, tf, df, idf, countPerDoc, weight, score, netWeight, formatter);
        openReport();

        System.out.println("Search Complete!!!");
    }

    /**
     * Splits a query into distinct lower-cased words, keeping first-seen order.
     *
     * @param query the raw query; may be {@code null} (dialog cancelled)
     * @return the distinct words; empty if there are none
     */
    private static String[] uniqueTerms(String query) {
        if (query == null) {
            return new String[0];
        }
        String[] words = query.toLowerCase().trim().split("\\s+");
        Set<String> seen = new HashSet<String>();
        String[] ordered = new String[words.length];
        int kept = 0;
        for (int i = 0; i < words.length; i++) {
            if (words[i].length() > 0 && seen.add(words[i])) {
                ordered[kept++] = words[i];
            }
        }
        String[] result = new String[kept];
        System.arraycopy(ordered, 0, result, 0, kept);
        return result;
    }

    /**
     * Counts the lower-cased, whitespace-separated words of a document.
     *
     * @param text the document text; {@code null} is treated as empty
     * @return a map from word to number of occurrences
     */
    private static Map<String,Integer> countWords(String text) {
        Map<String,Integer> counts = new HashMap<String,Integer>();
        if (text == null) {
            return counts;
        }
        String[] words = text.toLowerCase().trim().split("\\s+");
        for (int i = 0; i < words.length; i++) {
            if (words[i].length() == 0) {
                continue;
            }
            Integer current = counts.get(words[i]);
            counts.put(words[i], Integer.valueOf(current == null ? 1 : current.intValue() + 1));
        }
        return counts;
    }

    /**
     * Applies the shared look of every report cell (centred, thin border) plus a background colour.
     *
     * @param format the format to configure
     * @param background the cell background colour
     * @return {@code format}, for chaining
     * @throws WriteException if the spreadsheet library rejects a setting
     */
    private static WritableCellFormat styled(WritableCellFormat format, Colour background) throws WriteException {
        format.setBackground(background);
        format.setAlignment(Alignment.CENTRE);
        format.setVerticalAlignment(VerticalAlignment.CENTRE);
        format.setBorder(jxl.format.Border.ALL, BorderLineStyle.THIN);
        return format;
    }

    /**
     * Writes the statistics to {@link #REPORT_FILE}. The workbook is always
     * closed, even when filling or writing it fails.
     *
     * @param docNames base name of each document
     * @param terms the distinct query terms, one spreadsheet row each
     * @param tf total occurrences per term
     * @param df document frequency per term
     * @param idf inverse document frequency per term
     * @param countPerDoc term counts indexed {@code [document][term]}
     * @param weight count * idf indexed {@code [term][document]}
     * @param score per-term sum of weights
     * @param netWeight sum of all scores
     * @param formatter number format used for the fractional values
     * @throws IOException if the file cannot be created or written
     * @throws WriteException if the spreadsheet library rejects a cell or format
     */
    private static void writeReport(String[] docNames, String[] terms, int[] tf, int[] df,
            double[] idf, int[][] countPerDoc, double[][] weight, double[] score,
            double netWeight, java.text.DecimalFormat formatter) throws IOException, WriteException {
        WorkbookSettings wbSettings = new WorkbookSettings();
        wbSettings.setUseTemporaryFileDuringWrite(true);
        wbSettings.setLocale(new Locale("en", "EN"));

        WritableWorkbook workbook = Workbook.createWorkbook(new File(REPORT_FILE), wbSettings);
        try {
            WritableSheet sheet = workbook.createSheet("Search", 0);

            WritableFont boldFont = new WritableFont(WritableFont.ARIAL,12,WritableFont.BOLD);
            WritableCellFormat headFormat = styled(new WritableCellFormat(boldFont), Colour.LIGHT_GREEN);
            WritableCellFormat bodyFormat = styled(new WritableCellFormat(), Colour.LIGHT_GREEN);
            WritableCellFormat tfFormat = styled(new WritableCellFormat(), Colour.SKY_BLUE);
            WritableCellFormat wfFormat = styled(new WritableCellFormat(), Colour.ICE_BLUE);

            // Fixed leading columns; each caption spans both header rows.
            String[] captions = {"term", "tf", "df", "idf"};
            for (int c = 0; c < captions.length; c++) {
                sheet.mergeCells(c, HEADER_ROW, c, SUB_HEADER_ROW);
                sheet.addCell(new Label(c, HEADER_ROW, captions[c], headFormat));
            }

            // Two columns per document: raw count ("tf") and weighted count ("wf").
            int docCount = docNames.length;
            for (int doc = 0; doc < docCount; doc++) {
                int tfCol = FIRST_DOC_COLUMN + doc * 2;
                int wfCol = tfCol + 1;
                sheet.mergeCells(tfCol, HEADER_ROW, wfCol, HEADER_ROW);
                sheet.addCell(new Label(tfCol, HEADER_ROW, "TXT "+docNames[doc], headFormat));
                sheet.addCell(new Label(tfCol, SUB_HEADER_ROW, "tf", tfFormat));
                sheet.addCell(new Label(wfCol, SUB_HEADER_ROW, "wf", wfFormat));
            }

            int scoreCol = FIRST_DOC_COLUMN + docCount * 2;
            sheet.mergeCells(scoreCol, HEADER_ROW, scoreCol, SUB_HEADER_ROW);
            sheet.addCell(new Label(scoreCol, HEADER_ROW, "score", headFormat));

            for (int i = 0; i < terms.length; i++) {
                int row = FIRST_DATA_ROW + i;
                sheet.addCell(new Label(0, row, terms[i], bodyFormat));
                sheet.addCell(new Label(1, row, String.valueOf(tf[i]), bodyFormat));
                sheet.addCell(new Label(2, row, String.valueOf(df[i]), bodyFormat));
                sheet.addCell(new Label(3, row, formatter.format(idf[i]), bodyFormat));
                for (int doc = 0; doc < docCount; doc++) {
                    int tfCol = FIRST_DOC_COLUMN + doc * 2;
                    sheet.addCell(new Label(tfCol, row, ""+countPerDoc[doc][i], bodyFormat));
                    sheet.addCell(new Label(tfCol + 1, row, formatter.format(weight[i][doc]), bodyFormat));
                }
                sheet.addCell(new Label(scoreCol, row, formatter.format(score[i]), headFormat));
            }

            int totalRow = FIRST_DATA_ROW + terms.length;
            sheet.addCell(new Label(scoreCol - 1, totalRow, "Sum Score", bodyFormat));
            sheet.addCell(new Label(scoreCol, totalRow, formatter.format(netWeight), headFormat));

            workbook.write();
        } finally {
            workbook.close();
        }
    }

    /**
     * Asks the Windows shell to open the finished report. This runs only after
     * the workbook has been written and closed; failure to launch (for example
     * on a non-Windows system) is reported but does not abort the program.
     */
    private static void openReport() {
        try {
            Runtime.getRuntime().exec(new String[] {"rundll32", "url.dll,FileProtocolHandler", REPORT_FILE});
        } catch (IOException e1) {
            System.err.println("Could not open " + REPORT_FILE + " automatically: " + e1.getMessage());
        }
    }

    /**
     * Program entry point; runs the demo once.
     *
     * @param args ignored
     * @throws WriteException if the spreadsheet library rejects a cell or format
     * @throws IOException if the spreadsheet cannot be created or written
     */
    public static void main(String[] args) throws WriteException, IOException {
        new SearchTest();
    }

}

โดยเราจะเก็บไฟล์ Text ไว้ที่ Folder "doc" ในโปรเจค

///////////////////////////////////////

//////////////////////////////////////

ตัวอย่างผลลัพธ์

รายงานออกมาเป็น Excel ต้องมี jxl.jar ดาวน์โหลดได้ที่

http://www.findjar.com/jar/net.sourceforge.jexcelapi/jars/jxl-2.6.jar.html

ไม่มีความคิดเห็น:

แสดงความคิดเห็น