# Purpose:
# This script is run daily from cron before the compaction script.
# On the days the compaction script triggers, this script should trigger as well.
# On triggered days, this script should trim old files to
# bring disk usage below 60%.
# Options accepted:
#
# $1 = directory to clean up; defaults to "/mnt/cassandra/data/myschema"
# Directory to clean; first CLI argument, with a sensible default.
DATADIR=${1:-/mnt/cassandra/data/myschema}
## ____ Config Variables ____
# Keep deleting the oldest filesets until disk Use% falls below this value:
goal_percentage_less_than=60
## ____ No user serviceable parts below this line ____
# Refuse to run against a path that is not a directory.
[[ -d ${DATADIR} ]] || {
  echo "Argument: ${DATADIR} directory does not exist. Aborting"
  exit 1
}
# Day of week (0=Sunday)
declare -r -i todays_dow=$(date +%w)
readonly -a days_of_week=( Sunday Monday Tuesday Wednesday Thursday Friday Saturday )
# Print a one-line usage synopsis (the script accepts no flags yet).
usage () {
  printf '%s <no options allowed yet>\n' "$0"
}
# we are not the compaction script, but if we were:
#
# if [[ ! -e /var/log/cassandra ]] ; then
# mkdir /var/log/cassandra
# elif [[ ! -d /var/log/cassandra ]] ; then
# echo "/var/log/cassandra is not a directory."
# exit 1
# fi
# learn and parse ring
# Ask the local Cassandra node for its ring view; keep only data lines
# (those starting with a digit) and their first field (the node IP).
# Bug fix: the pattern ^[0-9] must be quoted -- unquoted, [0-9] is a glob
# character class and could expand to a matching filename in the cwd.
declare -a ring=( $(/usr/bin/cassandra-nodetool -h 127.0.0.1 -p 12352 ring \
  | grep '^[0-9]' | cut -f1 -d' ') )
declare -i ringsize=${#ring[@]}
# sanity check: an empty ring means nodetool failed or returned garbage.
if (( ringsize < 1 )) ; then
  echo "count of ring nodes was less than 1. aborting."
  exit 2
fi
# Resolve our own hostname to the IP the ring would list us under.
declare -r this_hosts_ip=$(host $(hostname) | grep 'has address' | awk '{print $NF}')
echo "My hostname,IP is: $(hostname),${this_hosts_ip}"
if [[ -z ${this_hosts_ip} ]] ; then
  echo "unable to convert local hostname into an IP address. aborting."
  exit 3
fi
# Escape the dots so the IP can be matched literally with =~ below.
declare -r this_hosts_ip_regex=$(echo ${this_hosts_ip} | sed 's/\./\\./g')
# Sanity check, am I a member of this ring?
# (Also fixes the unbalanced parenthesis in the original error message.)
if [[ ! ${ring[@]} =~ ${this_hosts_ip_regex} ]] ; then
  echo "Couldn't find myself (${this_hosts_ip}) in ring: ${ring[*]}. aborting"
  exit 4
fi
# In a list of zero-indexed nodes, which one am I?
# Bug fix: this must be a *string* assignment. The old `let my_index=unset`
# arithmetically evaluated the (unset) variable named "unset" to 0, so
# my_index started out as 0 and the "did we find ourselves" check below
# could never fire -- a host missing from the ring silently became node 0.
my_index=unset
for i in ${!ring[@]} ; do
  [[ ${ring[i]} =~ ${this_hosts_ip_regex} ]] && {
    my_index=$i
    break
  }
done
# Sanity check, enforce that we found our index:
[[ ${my_index} == "unset" ]] && exit 5
# Spread nodes across the week: node N cleans on weekday N mod 7.
my_day_of_week=$(( my_index % 7 ))
# Check for the case where I am the last node, but my
# day of week is Sunday (same as first node's): two nodes adjacent in the
# ring would otherwise clean on the same day.
# Choose (3=Wednesday) to avoid adjacent (0=Sunday).
# old way: let "! (( ringsize - 1 ) % 7)" && my_day_of_week=3
if [[ ${my_index} -eq $(( ${ringsize} - 1 )) && ${my_day_of_week} -eq 0 ]] ; then
my_day_of_week=3
fi
echo "I will compact on ${days_of_week[$my_day_of_week]}s."
echo "Today is ${days_of_week[$todays_dow]}."
# NOTE(review): the previous "DO NOT SUBMIT / uncommment for production"
# marker was stale -- the early-exit guard below is already active, not
# commented out. Presumably the marker dated from local testing; confirm
# nothing still expects this gate to be disabled.
if [[ ${my_day_of_week} -ne ${todays_dow} ]] ; then
echo Not our lucky day.
exit 0
fi
echo "It's our lucky day. BEGIN ***cleaning up older files***"
# Clean up oldest filesets until disk is less than 60% full.
# Bug fix: the cd was unquoted and unchecked. Everything below (ls, globs,
# the eventual deletes) is relative to the data directory, so a failed cd
# must abort rather than let us plan deletions against the wrong cwd.
cd "${DATADIR}" || { echo "cd ${DATADIR} failed. Aborting"; exit 1; }
# df -kP: POSIX output, 1K blocks. Fields: fs, 1K-blocks, Used, Avail, Use%, mount.
declare -a -r DFSTATS=( $(df -kP "${DATADIR}" | tail -1) )
echo df reported capacity=${DFSTATS[4]/[%]/}%
echo Calculated Capacity=$(( ( 100 * DFSTATS[2] ) / DFSTATS[1] ))%
# Parallel arrays: filesizes[i] is the size (1K blocks) of filenames[i].
declare -a filesizes=() filenames=()
# Append a size/name pair for every file matching the glob in $1 to the
# filesizes/filenames globals.
# $1 arrives as a quoted glob (e.g. "*-3-*") and is deliberately expanded
# unquoted on the ls command line so the shell globs it here.
# ls flags: -1 one entry per line, -k sizes in KiB, -s prepend size,
# -t sort by mtime, -r reversed => oldest files first.
# NOTE(review): this parses ls output and breaks on filenames containing
# whitespace -- presumably safe for Cassandra data file names; confirm.
function load_filedata {
local size name
while read size name ; do
filesizes+=(${size})
filenames+=(${name})
done < <(ls -1krst ${1})
}
# Sum the global filesizes array and print the total (in 1K blocks) on
# stdout; callers capture it with $( ).
# Bug fixes: total is now initialized (the old code printed an empty
# string, not 0, for an empty array), and we no longer `return $total`
# (exit statuses wrap mod 256, so totals > 255 returned garbage; no
# caller used the status anyway).
function current_fileset_size_sum {
  local -i total=0
  local i
  for i in ${filesizes[@]} ; do
    total+=$i
  done
  echo "$total"
  return 0
}
# Get all fileset numbers and put them in an array, lowest number first.
# Each `ls -1kst` line is "SIZE NAME"; filenames look like
# prefix-NUMBER-suffix, so splitting on '-' makes field 2 the number.
# NOTE(review): assumes the prefix itself contains no '-', and that a
# lower fileset number means an older fileset -- confirm both.
# (egrep is deprecated; use grep -E, and quote the pattern.)
filesets_numbers_by_time=( $(ls -1kst | grep -E -v '^total' | awk -F- '{print $2}' | sort -n | uniq) )
declare -r filesets_count=${#filesets_numbers_by_time[@]}
# Add each fileset's filesizes into one value.
# Note: this creates a sparse array (indexed by fileset number, not 0..n).
declare -a filesets_sizes=()
for i in ${filesets_numbers_by_time[@]} ; do
  filesizes=()
  filenames=()
  load_filedata "*-${i}-*"
  filesets_sizes[$i]=$(current_fileset_size_sum)
  # echo "set filesets_sizes[$i] to ${filesets_sizes[$i]}"
done
# Load size/name data for the oldest (first-listed) fileset into the
# filesizes/filenames globals via load_filedata.
# NOTE(review): the fileset number is returned as the exit status, which
# wraps mod 256 for numbers > 255. No caller is visible in this file;
# confirm nothing relies on that return value before changing it.
function load_oldest_fileset_data {
load_filedata "*-${filesets_numbers_by_time[0]}-*"
return ${filesets_numbers_by_time[0]}
}
declare -i count_of_filesets_to_delete=0
declare -i expected_capacity=100 # Sane default: assume full until computed
declare -i accumulated_deletes_in_kbytes=0
declare -a filesets_to_delete=()
# Recompute the disk Use% we would have after removing every fileset
# currently planned in $filesets_to_delete. Progress text goes to stderr;
# the per-fileset sizes go to stdout.
# External variables modified by this function
# $expected_capacity
function current_expected_capacity {
  local -i total_of_filesets=0
  for i in ${filesets_to_delete[@]} ; do
    total_of_filesets+=${filesets_sizes[$i]}
    # Bug fix: the old line was `echo fileset # $i size is == ...` -- the
    # bare # started a comment, so only the word "fileset" was printed.
    echo "fileset #$i size is == ${filesets_sizes[$i]}"
  done
  echo -n "Calculated %-Capacity after removing filesets \"${filesets_to_delete[*]}\"=" >&2
  expected_capacity=$(( ( 100 * ( DFSTATS[2] - total_of_filesets ) ) / DFSTATS[1] ))
  echo "${expected_capacity}%" >&2
  # NOTE(review): the status wraps for values outside 0..255; the while
  # loop below reads $expected_capacity directly, not this status.
  return ${expected_capacity}
}
# External variables modified by this function
# $filesets_to_delete
# $filesets_numbers_by_time
# $count_of_filesets_to_delete
# Move the oldest remaining fileset number onto the deletion plan and
# bump the planned-delete counter.
function add_oldest_fileset {
  local oldest=${filesets_numbers_by_time[0]}
  filesets_to_delete=( "${filesets_to_delete[@]}" "${oldest}" )
  echo "filesets_to_delete=${filesets_to_delete[*]}"
  # drop the oldest fileset from this list:
  filesets_numbers_by_time=( "${filesets_numbers_by_time[@]:1}" )
  count_of_filesets_to_delete+=1
}
# sets initial value of $expected_capacity
current_expected_capacity
# Plan deletions, oldest fileset first, until the projected Use% is at or
# below the goal.
while (( expected_capacity > goal_percentage_less_than )) ; do
  add_oldest_fileset
  current_expected_capacity
  # Safety valve: fires once a 4th fileset has been planned, i.e. when we
  # would delete more than 3. (The old message said "more than 4", which
  # did not match the -gt 3 condition; the message is corrected here.)
  (( count_of_filesets_to_delete > 3 )) && {
    echo "Planner thinks we need to delete more than 3 sets,"
    echo "We might have a problem here... Aborting."
    exit 6
  }
done
# Safety valve: refuse any plan that would remove more than half of all
# known filesets.
if (( ${#filesets_to_delete[@]} > filesets_count / 2 )) ; then
  echo "Plan is to delete too many filesets: ${#filesets_to_delete[@]} of ${filesets_count}"
  echo Aborting
  exit 7
fi
# do the deletes -- currently a dry run: list each planned fileset's files
# instead of removing them.
echo "If I was a real cleanup script, I would now delete:"
for fileset in ${filesets_to_delete[@]} ; do
  ls -l *-"${fileset}"-*
done