Introduction to the HDFS FileSystem Class

FileSystem API: http://hadoop.apache.org/docs/current/api/org/apache/hadoop/fs/FileSystem.html

The FileStatus class: https://blog.csdn.net/qq_40794973/article/details/88064201

HDFS API operations: https://blog.csdn.net/qq_40794973/article/details/86713917#t11


LocatedFileStatus is a subclass of FileStatus.

import org.apache.hadoop.fs.FileStatus;
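A minimal sketch of the difference (assuming an already-open FileSystem fs and a hypothetical file /a.txt): a LocatedFileStatus already carries the file's block locations, while a plain FileStatus requires a second call to getFileBlockLocations().

// Sketch: block locations via FileStatus vs. LocatedFileStatus
// (fs is an open FileSystem; /a.txt is a hypothetical file)
public void compareStatusTypes(FileSystem fs) throws IOException {
    // Plain FileStatus: locations require an extra call
    FileStatus st = fs.getFileStatus(new Path("/a.txt"));
    BlockLocation[] viaExtraCall = fs.getFileBlockLocations(st, 0, st.getLen());
    // LocatedFileStatus: locations come bundled with the status
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/a.txt"), false);
    BlockLocation[] bundled = it.next().getBlockLocations();
    System.out.println(viaExtraCall.length + " == " + bundled.length);
}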

Examples of common methods

The listFiles() method:

/**
 * List the statuses and block locations of the files in the given path.
 *
 * If the path is a directory,
 *   if recursive is false, returns files in the directory;
 *   if recursive is true, return files in the subtree rooted at the path.
 * If the path is a file, return the file's status and block locations.
 *
 * @param f is the path
 * @param recursive if the subdirectories need to be traversed recursively
 *
 * @return an iterator that traverses statuses of the files
 *
 * @throws FileNotFoundException when the path does not exist;
 *         IOException see specific implementation
 */
public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)
        throws FileNotFoundException, IOException {
    // ... implementation omitted ...
}
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
/**
 * Obtain a FileSystem instance.
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    Configuration configuration = new Configuration();
    return FileSystem.get(new URI(uri), configuration, user);
}

/**
 * Iterate over all the files and print each file's details.
 */
@Test
public void test8() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system
    FileSystem fs = getFileSystem("hdfs://hadoop102:9000", "atguigu");

    // 2. Get the file details
    // RemoteIterator: an iterator over a collection whose elements must be fetched remotely
    RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true); // second argument true = recursive
    while (listFiles.hasNext()) {
        LocatedFileStatus status = listFiles.next(); // LocatedFileStatus extends FileStatus
        // Print the details
        System.out.println("File name: " + status.getPath().getName());
        System.out.println("Path: " + status.getPath());
        System.out.println("Length: " + status.getLen());
        System.out.println("Permissions: " + status.getPermission());
        System.out.println("Group: " + status.getGroup());
        // Get the stored block information
        BlockLocation[] blockLocations = status.getBlockLocations();
        for (BlockLocation blockLocation : blockLocations) {
            // Hosts that store each block
            String[] hosts = blockLocation.getHosts();
            for (String host : hosts) {
                System.out.println("Host storing the block: " + host);
            }
        }
        System.out.println("------------------------------------------------------------");
    }

    // 3. Close resources
    fs.close();
}

Output:

File name: a.txt
Path: hdfs://hadoop102:9000/a.txt
Length: 25
Permissions: rw-rw-rw-
Group: atguigu
Host storing the block: hadoop103
Host storing the block: hadoop102
Host storing the block: hadoop104
------------------------------------------------------------


The listStatus() method:

public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException
public abstract FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException;
private void listStatus(ArrayList<FileStatus> results, Path f, PathFilter filter) throws FileNotFoundException, IOException
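As a hedged sketch, the Path[] overload can list several directories in one call; the result is simply the concatenation of listing each path (fs is an open FileSystem; /dir1 and /dir2 are hypothetical paths):

// Sketch: listing multiple paths at once with listStatus(Path[])
public void listSeveral(FileSystem fs) throws IOException {
    FileStatus[] statuses = fs.listStatus(new Path[]{new Path("/dir1"), new Path("/dir2")});
    for (FileStatus s : statuses) {
        System.out.println(s.getPath());
    }
}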


import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.Timestamp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Obtain a FileSystem instance.
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    Configuration configuration = new Configuration();
    return FileSystem.get(new URI(uri), configuration, user);
}

/**
 * Recursively walk the HDFS file system; FileStatus exposes each entry's metadata.
 */
@Test
public void test9() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system
    FileSystem fs = getFileSystem("hdfs://hadoop102:9000", "atguigu");
    Path path = new Path("/");
    listFilesStatus(path, fs);
    // 3. Close resources
    fs.close();
}

public void listFilesStatus(Path path, FileSystem hdfs) throws IOException, InterruptedException, URISyntaxException {
    FileStatus[] files = hdfs.listStatus(path);
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        if (file.isFile()) {
            System.out.println("-----------------This is a file--------------------------");
            System.out.println("File length: " + file.getLen());
            System.out.println("File path: " + file.getPath());
            System.out.println("File name: " + file.getPath().getName());
            System.out.println("Parent path: " + file.getPath().getParent());
            System.out.println("Last modified: " + new Timestamp(file.getModificationTime()));
            System.out.println("Block size: " + file.getBlockSize());
            System.out.println("Group: " + file.getGroup());
            System.out.println("Owner: " + file.getOwner());
            System.out.println("Last access time: " + file.getAccessTime());
            System.out.println("Replication: " + file.getReplication());
            System.out.println("-------------------------------------------");
        } else {
            System.out.println("--This is a directory--");
            System.out.println("Parent path: " + file.getPath().getParent());
            // Recurse into the subdirectory
            listFilesStatus(file.getPath(), hdfs);
        }
    }
}

-----------------This is a file--------------------------
File length: 444
File path: hdfs://hadoop102:9000/hehe.txt
File name: hehe.txt
Parent path: hdfs://hadoop102:9000/
Last modified: 2019-03-01 04:48:48.28
Block size: 134217728
Group: supergroup
Owner: atguigu
Last access time: 1551386928141
Replication: 2
-------------------------------------------
--This is a directory--
Parent path: hdfs://hadoop102:9000/test

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
import java.io.IOException;
import java.net.URI;

@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // 1. Get the HDFS client object (configured to run against the cluster)
    FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu");
    FileStatus[] fileStatuses = fs.listStatus(new Path("/test.txt"));
    // Convert the FileStatus array into a Path array
    Path[] paths = FileUtil.stat2Paths(fileStatuses);
    // for (FileStatus status : fileStatuses) {
    //     System.out.println(status.getPath().getName());
    // }
    System.out.println("--------------");
    for (Path path : paths) {
        System.out.println(path);
    }
    fs.close();
}

Note: the stat2Paths() method in Hadoop's FileUtil converts an array of FileStatus objects into an array of Path objects.

Filtering with PathFilter:

@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // 1. Get the HDFS client object (configured to run against the cluster)
    FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu");
    // Keep only files whose names end with .txt
    /*
    FileStatus[] fileStatuses = fs.listStatus(new Path("/"), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(".txt");
        }
    });
    */
    FileStatus[] fileStatuses = fs.listStatus(new Path("/"), path -> path.getName().endsWith(".txt"));
    // Convert the FileStatus array into a Path array
    Path[] paths = FileUtil.stat2Paths(fileStatuses);
    for (Path path : paths) {
        System.out.println(path);
    }
    fs.close();
}
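For reference, PathFilter is a single-method interface, which is why the lambda above works; its shape in org.apache.hadoop.fs is essentially:

public interface PathFilter {
    // Return true if the given path should be kept in the listing
    boolean accept(Path path);
}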

The globStatus() method (used for filtering):

The Path here may contain wildcards, e.g. Path path = new Path("/*"); the supported wildcards are listed below (a short sketch follows the table).

Wildcards and their meanings:

*        matches zero or more characters
?        matches a single character
[ab]     matches a single character from the set {a, b}
[^ab]    matches a single character not in the set {a, b}
[a-b]    matches a single character in the (inclusive) range a to b
[^a-b]   matches a single character outside the range a to b
{a,b}    matches either expression a or expression b
\c       escapes the metacharacter c, matching it literally
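A short sketch exercising a few of these wildcards with globStatus (fs is an open FileSystem; the /data paths are hypothetical):

// Sketch: glob patterns with globStatus
public void globExamples(FileSystem fs) throws IOException {
    FileStatus[] txtOrLog = fs.globStatus(new Path("/data/*.{txt,log}")); // .txt or .log files
    FileStatus[] oneChar = fs.globStatus(new Path("/data/file?.txt"));    // file1.txt, fileA.txt, ...
    FileStatus[] years = fs.globStatus(new Path("/data/201[0-9]"));       // /data/2010 .. /data/2019
    System.out.println(txtOrLog.length + " " + oneChar.length + " " + years.length);
}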

Find the files and directories directly under / whose names contain "hehe":

import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
/**
 * Obtain a FileSystem instance.
 */
public FileSystem getFileSystem(String uri, String user) throws IOException, InterruptedException, URISyntaxException {
    Configuration configuration = new Configuration();
    return FileSystem.get(new URI(uri), configuration, user);
}

/**
 * Path filtering with globStatus.
 * Here we want the paths (files or directories) directly under / whose names contain "hehe".
 * globStatus is very flexible -- the pattern itself may contain glob expressions, which can
 * be quite effective when preprocessing large datasets.
 */
@Test
public void test10() throws IOException, InterruptedException, URISyntaxException {
    // 1. Get the file system
    FileSystem hdfs = getFileSystem("hdfs://hadoop102:9000", "atguigu");
    Path path = new Path("/*"); // glob pattern
    // FileStatus[] fileGlobStatuses = hdfs.globStatus(path, new PathFilter() {
    //     @Override
    //     public boolean accept(Path x) {
    //         // Keep only paths containing the string "hehe"
    //         return x.toString().contains("hehe");
    //     }
    // });
    FileStatus[] fileGlobStatuses = hdfs.globStatus(path, x -> x.getName().contains("hehe"));
    if (fileGlobStatuses != null) { // print only when non-null
        Path[] globPaths = FileUtil.stat2Paths(fileGlobStatuses);
        for (Path p : globPaths) {
            System.out.println("Path after glob filtering: " + p);
        }
    } else {
        System.out.println("No matching file or directory found");
    }
    hdfs.close();
}

Path after glob filtering: hdfs://hadoop102:9000/hehe
Path after glob filtering: hdfs://hadoop102:9000/hehe.txt

Find the files and directories within the first three levels of the directory tree whose names start with a and end with .txt:

import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@Test
public void test11() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // 1. Get the file system
    FileSystem hdfs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu");
    Path path = new Path("/*/*/*/a*.txt"); // glob pattern
    FileStatus[] status = hdfs.globStatus(path);
    if (status != null) { // print only when non-null
        for (FileStatus p : status) {
            if (hdfs.exists(p.getPath())) {
                System.out.println(p.getPath());
            }
        }
    } else {
        System.out.println("No matching a*.txt file found");
    }
    hdfs.close();
}

hdfs://hadoop102:9000/a.txt
hdfs://hadoop102:9000/aa.txt
hdfs://hadoop102:9000/aaaaa.txt
hdfs://hadoop102:9000/aab.txt

Print all files and directories in the first four levels:

@Test
public void test12() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    // 1. Get the file system
    FileSystem hdfs = FileSystem.get(URI.create("hdfs://hadoop102:9000"), conf, "atguigu");
    Path path = new Path("/*/*/*/*"); // glob pattern
    FileStatus[] status = hdfs.globStatus(path);
    if (status != null) { // print only when non-null
        for (FileStatus p : status) {
            if (hdfs.exists(p.getPath())) {
                System.out.println(p.getPath());
            }
        }
    } else {
        System.out.println("No files found");
    }
    hdfs.close();
}

Note: the else branch in the code above is never reached: because the pattern contains wildcards, globStatus returns an empty array rather than null even when nothing matches.
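A minimal sketch of that distinction (fs is an open FileSystem; the paths are hypothetical):

// Sketch: a wildcard pattern with no matches yields an empty array,
// while a non-existent literal path (no glob characters) yields null
public void nullVersusEmpty(FileSystem fs) throws IOException {
    FileStatus[] noMatch = fs.globStatus(new Path("/*/no-such-*.txt"));
    System.out.println(noMatch == null ? "null" : "empty array, length=" + noMatch.length);
    FileStatus[] missing = fs.globStatus(new Path("/no-such-file"));
    System.out.println(missing == null ? "null" : "length=" + missing.length);
}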

// HDFS itself ships a number of filters; Hadoop: The Definitive Guide offers a
// regular-expression PathFilter implementation that excludes matching paths:
public class RegexExcludePathFilter implements PathFilter {
    private final String regex;

    public RegexExcludePathFilter(String regex) {
        this.regex = regex;
    }

    @Override
    public boolean accept(Path path) {
        return !path.toString().matches(regex);
    }
}
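A usage sketch in the spirit of the book's example (the date-partitioned paths are illustrative): expand a glob, then exclude one day with the filter.

// Sketch: combine a glob with RegexExcludePathFilter
// (fs is an open FileSystem; /2007/*/* is a hypothetical date layout)
FileStatus[] kept = fs.globStatus(new Path("/2007/*/*"),
        new RegexExcludePathFilter("^.*/2007/12/31$"));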
