When I was curious to learn more about Rust, I found The Rust Programing Language
to be an excellent learning resource. After finishing the book, I wanted to complete a small
project to practice what I had learned. Whenever I
try to come up with project ideas, I make a list of programs I can actually use in my daily
life. In my Master's degree program I have downloaded countless
SDKs, frameworks, CLI tools, etc. Most of these were really only useful or necessary in the
context of a single project or course, and afterwards
they continue to take up quite a bit of disk space. My project idea was to write a CLI tool
that
would accept a path to a local directory as an
argument and print to std::out
the ten largest files present within that
directory
or any
subdirectories.
I have completed this project in Rust, Go, C++, and C. What follows is my code for each solution and reflections on what I have learned. The problem I have used each of these languages to solve is the following:
std::out
along with their size in bytes.
Working in C always makes me appreciate the data structures implemented within other languages standard libraries. As C does not have a built-in dynamically allocated array data structure, I used the opportunity to work with dynamic memory allocation and grow an array as necessary.
// initial size of array used to contain filesystem entries
size_t FS_INFO_ARR_SIZE = 100;
// number of largest entries to output
const int NUM_ENTRIES = 10;
/*
A struct to contain the name of a filesystem entry and its size in bytes.
An array of this type will be used to catalog all filesystem entries for
the directory specified as command line argument.
*/
typedef struct FS_Info {
char name[1024];
long long size;
} FS_Info ;
// global pointer to current FS_Info array
// will be updated as the array is resized using dynamic memory allocation
FS_Info *curr_fs_info_ptr;
// used to sort fs_entries array descending in terms of entry size
static int compare(const void *a, const void *b)
{
const struct FS_Info *entryA = (FS_Info *)a;
const struct FS_Info *entryB = (FS_Info *)b;
return (entryB->size - entryA->size) - (entryA->size - entryB->size);
}
stat()
method may be called on the entry at this
path.
stat
structure defined in sys/stat.h
. This method is
used
to obtain the size in bytes
of filesystem entries on POSIX systems (it won't work on Windows).
void update_fs_info_arr(char *path)
{
static int items_added = 0;
struct stat st;
if (stat(path, &st) == 0)
{
if (items_added < FS_INFO_ARR_SIZE) // if array capacity will not be exceeded
{
strncpy(curr_fs_info_ptr[items_added].name, path, sizeof(curr_fs_info_ptr[items_added].name) - 1);
curr_fs_info_ptr[items_added].size = st.st_size;
items_added++;
}
else // double the size of the containing array
{
FS_INFO_ARR_SIZE *= 2;
FS_Info *resized_fs_entries = realloc(curr_fs_info_ptr, FS_INFO_ARR_SIZE * sizeof(*curr_fs_info_ptr));
if (resized_fs_entries)
{
curr_fs_info_ptr = resized_fs_entries;
strncpy(curr_fs_info_ptr[items_added].name, path, sizeof(curr_fs_info_ptr[items_added].name) - 1);
curr_fs_info_ptr[items_added].size = st.st_size;
items_added++;
}
else
{
perror("An error occurred when attempting to resize the array!");
exit(EXIT_FAILURE);
}
}
}
else
{
printf("Error getting stat for entry %s: %d\n", path, stat(path, &st));
}
}
dirent
structure is defined in dirent.h
and is used to
access the provided
directory in order to examine all filesystem entries contained therein.
.
or the
parent
directory ..
are not
recursed upon, as this results in an infinite loop.
update_fs_info_arr()
function.
path_to_entry
is defined as having a size
PATH_MAX
.
This size is platform defined, and requires including the limits
header
file.
void walk(const char *currDir)
{
DIR *dir = opendir(currDir);
struct dirent *entry;
if (dir == NULL)
{
printf("%s could not be opened", currDir);
return;
}
while ((entry = readdir(dir)) != NULL)
{
// if directory is current dir or parent dir
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
{
continue;
}
char path_to_entry[PATH_MAX];
snprintf(path_to_entry, sizeof(path_to_entry) - 1, "%s/%s", currDir, entry->d_name);
path_to_entry[sizeof(path_to_entry) - 1] = '\0';
// use path_to_entry to call stats on the entry
update_fs_info_arr(path_to_entry); // fs_entries will only be used on first call
if (entry->d_type == DT_DIR)
{
// recursively visit subdirectories
walk(path_to_entry);
}
}
closedir(dir);
}
startTime
variable to benchmark this
implementation against
a statically allocated array implementation described below.
calloc()
is used to allocate memory for and initialize an array of
FS_INFO
structs. A pointer
to the start of this array will be stored in the variable *fs_entries
.
curr_fs_info_ptr
is assigned to this value returned
by
calloc()
.
FS_INFO
struct and added to the array
as described above, or else recursed upon if they are subdirectories.
qsort()
. This function
expects arguments corresponding to the array of FS_INFO
structs to be
sorted, the size size of the array, and the
comparison function described above.
int main(int argc, char *argv[])
{
float startTime = (float)clock()/CLOCKS_PER_SEC;
// a char array to hold a filesystem path
char target_dir[PATH_MAX];
if (argc > 2)
{ // more than one argument passed at invocation
char *error_message = "Usage: %s \n", argv[0];
fprintf(stderr, "%s", error_message);
return EXIT_FAILURE;
}
else if (argc == 1)
{ // no argument passed at invocation, default to current working directory
if (getcwd(target_dir, sizeof(target_dir)) != NULL)
{
printf("Defaulting to current directory\n");
}
else
{
perror("Unable to detect current working directory, try passing a directory as command line argument");
exit(EXIT_FAILURE);
}
}
else
{ // exactly one path specified as argument at invocation
strncpy(target_dir, argv[1], PATH_MAX);
};
printf("Finding %d largest files in: %s\n", NUM_ENTRIES, target_dir);
/* Create a pointer to the start of the FS_Info array and set the global
variable curr_fs_info_ptr to this memory address
*/
FS_Info *fs_entries = calloc(FS_INFO_ARR_SIZE, sizeof(*fs_entries));
if (!fs_entries)
{
fprintf(stderr, "Malloc of fs_entries failed in main\n");
return EXIT_FAILURE;
}
curr_fs_info_ptr = fs_entries;
// recursively visit all entries in the specified directory
walk(target_dir);
// sort the entries descending by file size
qsort(curr_fs_info_ptr, FS_INFO_ARR_SIZE, sizeof(*curr_fs_info_ptr), compare);
float endTIme = (float)clock()/CLOCKS_PER_SEC;
printf("Program completed in %f seconds", endTIme - startTime);
// output ten largest files found
for (int i = 0; i < NUM_ENTRIES; i++)
{
printf("%s\t%lld\n", curr_fs_info_ptr[i].name, curr_fs_info_ptr[i].size);
}
free(curr_fs_info_ptr); // good practice, but program is over and OS will reclaim anyways
return EXIT_SUCCESS;
}
// dynamic memory allocation averages 0.25 seconds
FS_INFO
array
and it will not be updated.
FS_Info
structure is used to contain information about
filesystem entries.
// size of array used to contain filesystem entries
const size_t FS_INFO_ARR_SIZE = 10;
/*
A struct to contain the name of a filesystem entry and its size in bytes.
An array of this type will be used to catalog all filesystem entries for
the directory specified as command line argument.
*/
typedef struct FS_Info
{
char name[PATH_MAX];
long long size;
} FS_Info;
// global pointer to FS_Info array
FS_Info fs_info_arr[FS_INFO_ARR_SIZE];
// used to sort fs_entries array descending in terms of entry size
static int compare(const void *a, const void *b)
{
const struct FS_Info *entryA = (FS_Info *)a;
const struct FS_Info *entryB = (FS_Info *)b;
return (entryB->size - entryA->size) - (entryA->size - entryB->size);
}
FS_Info
array
and determine at which index resides the struct having the smallest size member.
long long
type to handle any large integers that may arise.
/*
Iterates over an array of FS_Info structs and returns a pointer to the struct
having the smallest size member.
*/
FS_Info *get_smallest_entry(FS_Info *entries)
{
long long smallest = entries[0].size;
FS_Info *target = &entries[0];
for (int i = 1; i < FS_INFO_ARR_SIZE; i++)
{
if (entries[i].size < smallest)
{
smallest = entries[i].size;
target = &entries[i];
}
}
return target;
}
/*
Add entires to the array. If the array is full, use the above function to find the
struct having the smallest file size, and if the current file size is larger, replace it.
*/
void update_fs_info_arr(char *path)
{
static int items_added = 0;
struct stat st;
if (stat(path, &st) == 0)
{
if (items_added < FS_INFO_ARR_SIZE) // if array capacity will not be exceeded
{
strncpy(fs_info_arr[items_added].name, path, PATH_MAX);
fs_info_arr[items_added].size = st.st_size;
items_added++;
}
else
// find entry having the smallest size and replace it with the current entry if it is larger
{
FS_Info *smallest = get_smallest_entry(fs_info_arr);
if (st.st_size > smallest->size)
{
strncpy(smallest->name, path, PATH_MAX);
smallest->size = st.st_size;
}
}
}
else
{
printf("Error getting stat for entry %s: %d\n", path, stat(path, &st));
}
}
walk()
function is almost identical to the dynamically allocated
implementation.
char[]path_to_entry
is modified to contain the next directory to traverse.
strcpy()
snprintf()
and
buffer sizing.
void walk(const char *currDir)
{
DIR *dir = opendir(currDir);
struct dirent *entry;
if (dir == NULL)
{
printf("%s could not be opened", currDir);
return;
}
while ((entry = readdir(dir)) != NULL)
{
// if directory is current dir or parent dir
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
{
continue;
}
char path_to_entry[PATH_MAX];
snprintf(path_to_entry, sizeof(path_to_entry), "%s/%s", currDir, entry->d_name);
update_fs_info_arr(path_to_entry);
if (entry->d_type == DT_DIR)
{
walk(path_to_entry);
}
}
closedir(dir);
}
main()
function hasn't significantly changed.
qsort()
and the comparison function defined
above
are once again used.
std::out
and the
program
exits.
int main(int argc, char *argv[])
{
float startTime = (float)clock()/CLOCKS_PER_SEC;
// a char array to hold a filesystem path
char target_dir[PATH_MAX];
if (argc > 2)
{ // more than one argument passed at invocation
char *error_message = "Usage: %s \n", argv[0];
fprintf(stderr, "%s", error_message);
return EXIT_FAILURE;
}
else if (argc == 1)
{ // no argument passed at invocation, default to current working directory
if (getcwd(target_dir, sizeof(target_dir)) != NULL)
{
printf("Defaulting to current directory\n");
}
else
{
perror("Unable to detect current working directory, try passing a directory as command line argument");
exit(EXIT_FAILURE);
}
}
else
{ // exactly one path specified as argument at invocation
strncpy(target_dir, argv[1], PATH_MAX);
};
strncpy(target_dir, argv[1], PATH_MAX);
printf("Finding the %zu largest files in: %s\n", FS_INFO_ARR_SIZE, target_dir);
// recursively visit all entries in the specified directory
walk(target_dir);
// sort the entries descending by file size
qsort(fs_info_arr, FS_INFO_ARR_SIZE, sizeof(*fs_info_arr), compare);
float endTime = (float)clock()/CLOCKS_PER_SEC;
printf("Program completed in %f seconds\n", endTime - startTime);
// output ten largest files found
for (int i = 0; i < FS_INFO_ARR_SIZE; i++)
{
printf("%s\t%lld\n", fs_info_arr[i].name, fs_info_arr[i].size);
}
return EXIT_SUCCESS;
}
// static allocation averages 0.2 seconds
Compared to the implementations using C, performing the same task using C++ is significantly
simpler. This
is thanks in large part to the standard library methods and data structures available in
C++. Of
note
in this implementation are the std::vector
and std::pair
data
structures, the
std::fs::filesystem::recursive_directory_iterator()
method, and the
std::sort()
method.
Iterating over a filesystem directory is such a common task that a method to do so
recursively
exists within the
filesystem
header file. This results in a much simpler implementation.
std::sort()
in order
to sort a vector of pairs in descending order by the second member of the pair, in
this case the size of a file.
// function to pass to std::sort in order to sort a vector of pairs by second item in descending order
bool sort_by_desc(const std::pair &a, const std::pair &b)
{
return a.second > b.second;
}
/*
Recursively visit all filesystem entries at the provided path, and if the entry is a regular file,
add its file name and file size to a vector of pairs. If the entry is a directory, this
function will be called recursively.
*/
void analyze_fs(std::vector> &entry_vec, std::string full_path)
{
for (const auto &entry : std::__fs::filesystem::recursive_directory_iterator(full_path))
{
if (entry.is_regular_file())
{
std::pair new_entry = std::make_pair(entry.path(), entry.file_size());
entry_vec.push_back(new_entry);
}
}
}
analyze_fs()
.
std::out
.
int main(int argc, char **argv)
{
float startTime = (float)clock() / CLOCKS_PER_SEC;
std::string full_path;
if (argc > 2) // erroneous command line arguments passed
{
std::cerr << "Usage: " << argv[0] << " path_to_directory" << std::endl;
return EXIT_FAILURE;
}
else if (argc == 1) // no path provided as argument
{
std::__fs::filesystem::path cwd = std::__fs::filesystem::current_path();
if (!cwd.empty())
{
std::cout << "Defaulting to current directory" << std::endl;
full_path = cwd;
}
}
else // exactly one path provided as command line argument
{
full_path = argv[1];
}
// holds the values of files and their sizes found by analyze_fs()
std::vector > entries;
analyze_fs(entries, full_path);
std::sort(entries.begin(), entries.end(), sort_by_desc);
float endTIme = (float)clock() / CLOCKS_PER_SEC;
// output the ten largest files found
if (entries.size() > 10)
{
for (int i = 0; i < 10; i++)
{
std::cout << entries[i].first << "\t" << entries[i].second << std::endl;
}
}
else
for (int i = 0; i < entries.size(); i++)
{
std::cout << entries[i].first << "\t" << entries[i].second << std::endl;
}
printf("\nProgram completed in %f seconds", endTIme - startTime);
return EXIT_SUCCESS;
}
// program completes in 0.002 seconds
Previously I have really enjoyed using Go for developing APIs and back-end web servers. Some of my favorite features are its built in memory management, its approach to error handling, a robust standard library, an active community with tons of useful public packages, and its excellent documentation. I had not previously created a CLI application using Go, and found this a perfect opportunity to do so.
main()
.
WalkFunc
type and WalkDir
function are defined in the
path/filesystem
package, provided
by the Standard Library.
walk()
expects arguments describing the path to a directory, an
fs.DirEntry
which is known in Go as an interface, an error value which is
passed implicitly, and a pointer to a slice of EntryInfo
structs.
EntryInfo
structs.
/*
A struct to hold a key value EntryInfo corresponding to single member of a
map[string]uint32. This will be used to sort the map data by value.
*/
type EntryInfo struct {
name string
size uint32
}
const NUM_ENTRIES = 10
// Implement a WalkDirFunc that accepts a reference to a slice of EntryInfo
func walk(path string, entry fs.DirEntry, err error, pairs *[]EntryInfo) error {
if err != nil {
return err
}
if !entry.IsDir() {
// add entry and its size to EntryInfo slice
file, err := os.Stat(path)
if err != nil {
log.Println("Error determining file size:", err)
}
newPair := EntryInfo{path, uint32(file.Size())}
*pairs = append(*pairs, newPair)
}
return nil
}
fullPath
.
EntryInfo
structs is declared that will be used to keep
track of
discovered files.
walk()
function defined above are passed to an
invocation of filepath.WalkDir()
which handles the logic of recursively traversing the specified directory.
walk()
definition defines what work will be performed here.
EntryInfo
structs are sorted in
descending order and output to the user.
walk()
.
walk()
and the append()
method
expects an argument describing the slice to be appended to. Since slices are
technically
passed by value in Go, it is not possible to
pass a slice to a function in Go (the result being a copy of the passed slice) and
then
expect the append()
method to function correctly.
&
operator similar to C and C++.
func main() {
// string representing directory to analyze
var fullPath = ""
args := os.Args
if len(args) > 2 {
msg := fmt.Sprintf("Usage: %v ", args[0])
log.Fatal(msg)
} else if len(args) == 1 {
cwd, err := os.Executable()
if err != nil {
log.Fatal("Unable to infer current working directory, try providing a path as program argument")
}
fullPath = filepath.Dir(cwd)
log.Println("No path specified, defaulting to current working directory")
} else {
fullPath = args[1]
}
fmt.Printf("Searching for %v largest files in %v\n\n", NUM_ENTRIES, fullPath)
programStart := time.Now()
// Declare a slice of pairs to hold entry information
var pairs []EntryInfo
// Recursively visit all filesystem entries at the provided path
filepath.WalkDir(fullPath, func(path string, entry fs.DirEntry, err error) error {
return walk(path, entry, err, &pairs)
})
// Sort descending
sort.Slice(pairs, func(a, b int) bool {
return pairs[a].size > pairs[b].size
})
programDuration := time.Since(programStart).Seconds()
// If the provided path had fewer than NUM_ENTRIES, print all entries and their sizes in bytes
if len(pairs) < NUM_ENTRIES {
for _, file := range pairs {
fmt.Printf("%s, %d\n", file.name, file.size)
}
} else { // print only the largest NUM_ENTRIES and their sizes in bytes
for i := 0; i < NUM_ENTRIES; i++ {
fmt.Printf("%s: %d\n", pairs[i].name, pairs[i].size)
}
}
fmt.Printf("\nProgram completed in %v seconds.\n", programDuration)
}
// program completes in 0.26 seconds on average
This is the project that started it all. Are you curious how the language voted by the StackOverflow annual survey respondents as their most loved language for seven straight years performs compared to the above implementations? Full disclosure, this was my first project in Rust and shouldn't be considered an optimal solution. There is likely a lot of room for improvement. That being said, this implementation was the slowest to traverse the Google Cloud SDK at 0.4 seconds on average.
Despite being a language with a loyal and enthusiastic following, it consistently ranks low on the TIOBE index, but rises each year. I wanted to learn Rust after I started casually looking through The Rust Programming Language. I had not previously encountered documentation for a language that was user friendly, descriptive, and packed with examples. If the cited book isn't enough, there is also Rust by Example, Rustlings, and Rust by Practice. I agree with the popular opinion that Rust has a steep learning curve, but the ample learning resources which are written to be helpful for someone with even no prior programming experience make learning Rust an efficient and enjoyable process.
Below is the implementation, if you are reading this and see room for performance improvements, let me know here.
main.rs
file contains only the
code
concerned with initial execution and possible termination of the
program.
lib.rs
file.
main.rs
file contains only a main()
function; this
is
idiomatic according to Rust documentation.
main()
gathers command line arguments and passes them to a
Config
implementation, described in detail below.
Config
struct (used to
contain program configuration - in this case the specified path),
an error message will be printed to std::err
and the program will exit with
a
non-zero exit code.
// main.rs
fn main() {
let start = Instant::now();
// variable to store arguments passed at program launch
// user should only pass in file_path to analyze a dir recursively
let args: Vec = env::args().collect();
// bind arguments to a Config struct
let config = Config::build(&args).unwrap_or_else(|err| {
eprintln!("Could not parse arguments: {err}");
process::exit(1);
});
// begin program using provided configuration
if let Err(e) = hello_rust::run(config) {
eprintln!("Fatal Error: {e}");
process::exit(1);
}
let duration = start.elapsed();
println!("\nProgram completed in {:?} seconds", duration.as_secs_f32());
}
// program completes in about 0.4 seconds on average
Config
struct is also defined that will hold only one member, a string
describing the specified path to traverse.
Config
struct contains only one
associated
function: build()
.
main()
and
using them to create
a Config
struct.
// lib.rs
// define how many entries should be output
const NUM_ENTRIES: usize = 10;
/*
Config struct used to hold the string representation of the directory to recursively analyze
*/
pub struct Config {
root_path: String,
}
impl Config {
pub fn build(args: &[String]) -> Result {
if args.len() == 1 {
// no argument passed at program invocation, default to current working directory
let cwd = env::current_dir().unwrap_or_else(|error| {
panic!("Unable to infer current working director, try passing a path as program argument. {:?}", error);
});
let path = cwd.into_os_string().into_string().unwrap_or_else(|error| {
panic!("Unable to infer current working director, try passing a path as program argument. {:?}", error);
});
println!("No path provided as argument, defaulting to current working directory.");
Ok(Config { root_path: String::from(path) })
} else if args.len() > 2 {
return Err("Expected only one argument");
} else {
let root_path = args[1].clone();
Ok(Config { root_path })
}
}
}
Results
struct is defined that will contain one member: a HashMap
containing
filenames as keys and file sizes as values.
run()
function is responsible for calling the search()
function which is described below.
Ok(())
call at the end of the function.
// lib.rs
/*
Results struct to hold a HashMap with file paths as keys and bytes of files as values
*/
//#[derive(Debug)]
pub struct Results {
result_map: HashMap,
}
/*
Accepts Config struct as argument in order to specify the directory to analyze
*/
pub fn run(config: Config) -> Result<(), Box> {
println!("Searching for {0} largest entries in {1}:\n", NUM_ENTRIES, config.root_path);
let results = search(&config.root_path)?;
// sort result_map by value (descending)
let mut sort_vec: Vec<(&String, &u64)> = results.result_map.iter().collect();
sort_vec.sort_by(|a, b| b.1.cmp(a.1));
// Print the largest ten files found in the specified directory
if sort_vec.len() > NUM_ENTRIES {
for i in 0..NUM_ENTRIES {
println!("{:?}", sort_vec[i]);
}
} else {
for i in 0..sort_vec.len() {
println!("{:?}", sort_vec[i]);
}
}
Ok(())
}
search()
function expects as argument a string describing the path of
the
directory to traverse.
Result
(not to be confused with a Results
struct) is
returned.
Result
type can be unwrapped to its successful value, in this case a
Results
struct, or else contain an error message.
Results
struct and passes it to the
visit_dirs()
function where it will be updated
to contain information about filesystem entries.
/*
Converts the string representation of a directory to a file path object and an instance
of a Results struct. Passes both to visit_dirs().
*/
pub fn search(path: &str) -> Result {
let root_path = Path::new(path);
let mut results = Results {
result_map: HashMap::::new(),
};
match visit_dirs(root_path, &mut results) {
Err(e) => eprintln!("Error calling visit_dirs() from search(): {:?}", e),
_ => (),
}
Ok(results)
}
visit_dir()
function expects arguments describing the path to the
specified
directory
and a mutable Results
struct.
fs::read_dir()
method with a given path and passes
the
resulting entries
to the add_entry()
function described below.
/*
Visit each file system entry in the specified directory and if it is a file, will call add_entry()
passing the entry as an argument. Otherwise, if an entry is a directory, it will be entered and searched
in a recursive manner.
*/
pub fn visit_dirs(dir: &Path, results: &mut Results) -> io::Result<()> {
if dir.is_dir() {
for entry in fs::read_dir(dir)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
visit_dirs(&path, results)?;
} else {
match add_entry(&entry, results) {
Err(e) => eprintln!("Error calling add_entry() from visit_dirs(): {:?}", e),
_ => (),
}
}
}
}
Ok(())
}
add_entries)
function is responsible for calling library functions to
determine
a given file's size.
Results
struct
member result_map
.
Results
struct is accessed within the
run()
function
to output the results of traversing the specified directory.
/*
For each file found, convert the file's path to a string representation and calculate the size in bytes
of the file. Insert these values into the results_map member of the Results struct.
*/
fn add_entry(entry: &DirEntry, results: &mut Results) -> io::Result<()> {
let path = entry.path();
let metadata = path.symlink_metadata()?;
let size = path.size_on_disk_fast(&metadata)?;
let str_path = path.to_str().unwrap_or("Unknown File");
results.result_map.insert(str_path.to_string(), size);
Ok(())
}