使用 LingPipe 自然语言处理包进行文本分类

/**
 * 使用 lingpipe的tf/idf分类器训练语料
 * 
 * @author laigood
 */
public class traintclassifier {
//训练语料文件夹
private static file tdir = new file("f:\\data\\category");
//定义分类
private static string[] categories = { "金融", "军事", "医学", "饮食" };
public static void main(string[] args) throws classnotfoundexception,
ioexception {

tfidfclassifiertrainer classifier = new tfidfclassifiertrainer(
new tokenfeatureextractor(charactertokenizerfactory.instance));
// 开始训练
for (int i = 0; i < categories.length; i++) {
file classdir = new file(tdir, categories[i]);
if (!classdir.isdirectory()) {
system.out.println("不能找到目录=" + classdir);
}
// 训练器遍历分类文件夹下的所有文件
for (file file : classdir.listfiles()) {
string text = files.readfromfile(file, "utf-8");
system.out.println("正在训练 " + categories[i] + file.getname());
classification classification = new classification(
categories[i]);
classified classified = new classified(
text, classification);
classifier.handle(classified);

}

// 把分类器模型写到文件上
system.out.println("开始生成分类器");
string modelfile = "f:\\data\\category\\tclassifier";
objectoutputstream os = new objectoutputstream(new fileoutputstream(
modelfile));
classifier.compileto(os);
os.close();

system.out.println("分类器生成完成");
}
}

 * @author laigood
 */
public class testtclassifier {
//测试语料的存放目录
private static file tdir = new file("f:\\data\\test");
private static string[] categories = { "金融", "军事", "医学", "饮食" };
public static void main(string[] args) throws classnotfoundexception {

//分类器模型存放地址
string modelfile = "f:\\data\\category\\tclassifier";
scoredclassifier compiledclassifier = null;
try {
objectinputstream oi = new objectinputstream(new fileinputstream(
modelfile));
compiledclassifier = (scoredclassifier) oi
.readobject();
oi.close();
} catch (ioexception ie) {
system.out.println("io error: model file " + modelfile + " missing");
}
// 遍历分类目录中的文件测试分类准确度
confusionmatrix confmatrix = new confusionmatrix(categories);
numberformat nf = numberformat.getinstance();
nf.setmaximumintegerdigits(1);
nf.setmaximumfractiondigits(3);
for (int i = 0; i < categories.length; ++i) {
file classdir = new file(tdir, categories[i]);
//对于每一个文件,通过分类器找出最适合的分类
for (file file : classdir.listfiles()) {
string text = "";
try {
text = files.readfromfile(file, "utf-8");
} catch (ioexception ie) {
system.out.println("不能读取 " + file.getname());
}
system.out.println("测试 " + categories[i]
+ file.separator + file.getname());
scoredclassification classification = compiledclassifier
.classify(text.subsequence(0, text.length()));
confmatrix.increment(categories[i],
classification.bestcategory());
system.out.println("最适合的分类: "
+ classification.bestcategory());


system.out.println("--------------------------------------------");
system.out.println("- 结果 ");
system.out.println("--------------------------------------------");
int[][] imatrix = confmatrix.matrix();
stringbuffer sb = new stringbuffer();
sb.append(stringtools.fillin("category", 10, true, ' '));
for (int i = 0; i < categories.length; i++)
sb.append(stringtools.fillin(categories[i], 8, false, ' '));
system.out.println(sb.tostring());
for (int i = 0; i < imatrix.length; i++) {
sb = new stringbuffer();
sb.append(stringtools.fillin(categories[i], 10, true, ' ',
10 - categories[i].length()));
for (int j = 0; j < imatrix.length; j++) {
string out = "" + imatrix[i][j];
sb.append(stringtools.fillin(out, 8, false, ' ',
8 - out.length()));
}
system.out.println(sb.tostring());
}
system.out.println("准确度: "
+ nf.format(confmatrix.totalaccuracy()));
system.out.println("总共正确数 : " + confmatrix.totalcorrect());
system.out.println("总数:" + confmatrix.totalcount());
}
}


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部