[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
troubles stopping ldm with ldmadmin on linux (fwd)
- Subject: troubles stopping ldm with ldmadmin on linux (fwd)
- Date: Thu, 18 May 2000 10:06:19 -0600 (MDT)
===============================================================================
Robb Kambic Unidata Program Center
Software Engineer III Univ. Corp for Atmospheric Research
address@hidden WWW: http://www.unidata.ucar.edu/
===============================================================================
---------- Forwarded message ----------
Date: Wed, 17 May 2000 13:17:14 -0600
From: Doug Hunt <address@hidden>
To: address@hidden
Subject: troubles stopping ldm with ldmadmin on linux
Hi all: I have recently been having troubles stopping LDM via 'ldmadmin
stop' on linux. The ldmadmin script seems to not check correctly if all
LDM kids are killed off. The result is that after an 'ldmadmin stop',
one must wait for a minute or so for all rpc.ldmd children to die. If
one tries 'ldmadmin start' during this time, it hangs...
I have made a small patch to 'ldmadmin' which seems to clean up this
problem. Instead of just killing off the rpc.ldmd process group leader,
it kills off all the kids too.
Attached is the new ldmadmin script.
Regards,
Doug Hunt
--
address@hidden
Software Engineer III
UCAR - COSMIC
Tel. (303) 497-2611
#!/bin/perl
#
# $Id: ldmadmin.in,v 1.45 1999/08/04 20:40:17 rkambic Exp $
#
# File: ldmadmin
#
# Copyright 1995 University Corporation for Atmospheric Research
# See ../COPYRIGHT file for copying and redistribution conditions.
#
# Description: This perl script provides a command line interface to LDM5
# programs.
#
# Files:
#
# $ldmhome/ldmd.pid file containing process group ID
# /tmp/.ldmadmin.lck lock file for operations that modify the LDM
#
# Environment Variables:
#
# Usage:
#
# ldmadmin command [options] [conf_file]
#
# commands:
#
# start [-v] [-q queue_path]
# stop
# restart [-v] [-q queue_path]
# mkqueue [-v] [-c] [-q queue_path] [-s size]
# delqueue [-q queue_path]
# mksurfqueue [-v] [-c] [-q queue_path] [-s size]
# delsurfqueue [-q queue_path]
# newlog [-n numlogs] [-l logfile]
# dostats
# scour
# isrunning
# check [-t hours]
# pqactcheck [-p pqact.conf]
# pqactHUP
# queuecheck
# watch [-f feedset]
# ps
# config
# log
# tail
# clean
# usage
#
###############################################################################
#
# get the existing runtime environment
#
# Take the first three space-separated fields of `uname -a`.  $os and
# $version are used further down to pick platform-specific ps(1) options;
# $hostname is replaced by the configured value in the section below.
($os,$hostname,$version) = split(/ /,`uname -a`);
###############################################################################
# CONFIGURATION SECTION
###############################################################################
# the fully qualified hostname of the machine.
$hostname = "typhoon.cosmic.ucar.edu";
# set perl location (used to run mailpqstats for the dostats command)
$perl = "/bin/perl";
# LDM home directory.
$ldmhome = "/usr/local/ldm";
# UDUNITS PATH
#
# If you will be running the gribtonc(1) decoder, you should set this path
# to the location of the udunits.dat file used by the udunits package. (i.e.
# $udunits = "/usr/local/etc/udunits.dat"). This should only need to be done
# if you installed the udunits library from the binary distribution, or if you
# built the udunits library from source code and moved the udunits.dat file
# to a different location afterwards.
#
#$udunits = "/usr/local/ldm/etc/udunits.dat";
# product queue size. By default this is set to 100MBytes. This should be
# sufficient size to hold one hour's worth of data from the HRS,DDS,PPS,IDS,
# and MCIDAS data streams.
$pq_size = 100000000;
# product queue size for pqsurf. By default this is set to 2MBytes. You
# probably won't need to change this.
$surf_size = 2000000;
# default number of logs to rotate with the newlog command
$numlogs = 4;
# file paths - everything here is based on the ldmhome variable by default.
$bin_path = "$ldmhome/bin";
$etc_path = "$ldmhome/etc";
$log_path = "$ldmhome/logs";
$data_path = "$ldmhome/data";
$pq_path = "$data_path/ldm.pq";
$surf_path = "$data_path/pqsurf.pq";
# ldmadmin file locations and names
$pid_file = "$ldmhome/ldmd.pid";
$lock_file = "/tmp/.ldmadmin.lck";
$log_file = "$log_path/ldmd.log";
$ldmd_conf = "$etc_path/ldmd.conf";
$pqact_conf = "$etc_path/pqact.conf";
$scour_file = "$etc_path/scour.conf";
# set this to 0 if you don't want the ldm log files rotated whenever you
# start or restart the ldm.
$log_rotate = 1;
###############################################################################
# END OF CONFIGURATION - You should not need to change anything below this
# point.
###############################################################################
#
# we need added runtime stuff here
# name used in usage and error messages
$progname = "ldmadmin";

# LDMHOSTNAME is exported for feedhere
$ENV{'LDMHOSTNAME'} = "$hostname";

# search the LDM bin directory and the usual system directories before
# whatever PATH was inherited
$ENV{'PATH'} = join(':',
    $bin_path, "/bin", "/usr/bin", "/usr/sbin", "/sbin", "/usr/ucb",
    "/usr/usb", "/usr/etc", "/etc", $ENV{'PATH'});

# defaults for the -t and -f options
$log_hours = 24;
$feedset = "ANY";

# export UDUNITS only when the configuration section defines $udunits
$ENV{'UDUNITS'} = "$udunits" if defined $udunits;

# we want a flush after every print statement
$| = 1;

#
# The first argument is the command; everything after it is an option or
# the name of an ldmd.conf file.
#
$command = shift @ARGV;

while ($_ = $ARGV[0]) {
    shift @ARGV;
    if (/^([a-z]|[A-Z]|\/)/) {          # bare word or path: conf file
        $ldmd_conf = $_;
    }
    elsif (/^-q/) {                     # options that take a value consume
        $q_path = shift @ARGV;          # the following argument
    }
    elsif (/^-s/) {
        $q_size = shift @ARGV;
    }
    elsif (/^-c/) {
        $pq_clobber++;
    }
    elsif (/^-v/) {
        $verbose++;
    }
    elsif (/^-n/) {
        $numlogs = shift @ARGV;
    }
    elsif (/^-l/) {
        $log_file = shift @ARGV;
    }
    elsif (/^-t/) {
        $log_hours = shift @ARGV;
    }
    elsif (/^-f/) {
        $feedset = shift @ARGV;
    }
    elsif (/^-p/) {
        $pqact_conf = shift @ARGV;
    }
}

# error if no command specified (print_usage exits)
print_usage() unless $command;
#
# process the command request
#
# Run the routine that implements $command and leave the resulting exit
# status in $status.  Routines return 0 on success and non-zero on failure;
# commands that run an external program directly copy its status from $?.
if ($command eq "start") {              # start the ldm
    $status = start_ldm();
}
elsif ($command eq "stop") {            # stop the ldm
    $status = stop_ldm();
}
elsif ($command eq "restart") {         # restart the ldm
    $status = restart_ldm();
}
elsif ($command eq "mkqueue") {         # create a product queue using pqcreate(1)
    $status = make_pq();
}
elsif ($command eq "delqueue") {        # delete a product queue
    $status = delete_pq();
}
elsif ($command eq "mksurfqueue") {     # create a product queue for pqsurf(1)
    $status = make_surf_pq();
}
elsif ($command eq "delsurfqueue") {    # delete a pqsurf product queue
    $status = del_surf_pq();
}
elsif ($command eq "newlog") {          # rotate the log files
    make_lockfile();
    $status = new_log();
    rm_lockfile();
}
elsif ($command eq "dostats") {         # mail stats to Unidata
    `$perl $bin_path/mailpqstats -d $log_path -h $hostname`;
    $status = $?;
}
elsif ($command eq "scour") {           # scour data directories
    `scour $scour_file`;
    $status = $?;
}
elsif ($command eq "isrunning") {       # check if the ldm is running
    $status = check_running();
}
elsif ($command eq "check") {           # analyze the log files
    # exec only returns when the program could not be started
    exec("$bin_path/ldmcheck -d $log_path")
        or bad_exit("check: cannot execute $bin_path/ldmcheck: $!");
}
elsif ($command eq "watch") {           # monitor incoming products
    exec("$bin_path/pqutil -f $feedset -w $pq_path")
        or bad_exit("watch: cannot execute $bin_path/pqutil: $!");
}
elsif ($command eq "pqactcheck") {      # check pqact file for errors
    ldmadmin_pqactcheck();
}
elsif ($command eq "pqactHUP") {        # HUP pqact
    ldmadmin_pqactHUP();
}
elsif ($command eq "queuecheck") {      # check queue for corruption
    $status = ldmadmin_queuecheck();
}
elsif ($command eq "ps") {              # get the ldm process information
    ldmadmin_ps();
}
elsif ($command eq "config") {          # show the ldm configuration
    $status = ldm_config();
}
elsif ($command eq "log") {             # do a more on the logfile
    system("more","$log_file");
    $status = $?;
}
elsif ($command eq "tail") {            # do a tail -f on the logfile
    system("tail","-f","$log_file");
    $status = $?;
}
elsif ($command eq "clean") {           # rm lockfile and ldmd.pid file
    system("rm", "-f", $lock_file, $pid_file);
    $status = $?;
}
elsif ($command eq "usage") {           # print usage message
    $status = print_usage();
}
else {                                  # bad command
    $status = print_usage();
}
#
# that's all folks
#
# A status copied from $? is a wait status: a child that exited with code 1
# yields 256, and exit() passes only the low 8 bits to the parent, so
# "exit 256" would report success.  Anything outside 0..255 (including the
# -1 Perl uses when the program could not be run) becomes a plain failure.
$status = 0 unless defined $status;
$status = 1 if $status < 0 || $status > 255;
exit $status;
###############################################################################
# bad_exit error routine. Removes the lock file if it exists, writes the
# error to stderr and exits with status 1.
###############################################################################
sub bad_exit {
    # Report a fatal error and terminate.
    #
    # Argument: $err_str - text of the error message.
    # Side effects: deletes $lock_file when it exists, prints one line to
    # STDERR and exits with status 1; it never returns.
    my($err_str) = @_;
    my($date_str) = get_date();

    # remove the lockfile if it exists
    if (-e $lock_file) {
        unlink($lock_file);
    }

    # output to standard error.  The line is built with sprintf because
    # "$progname[$<]" inside a double-quoted string is parsed as an element
    # of the array @progname, which dropped the program name and the user id
    # ($< is the real UID) from the message.
    print STDERR sprintf("%s %s %s[%s]: %s\n",
                         $date_str, $hostname, $progname, $<, $err_str);

    # exit with extreme prejudice
    exit 1;
}
###############################################################################
# Date Routine. Gets data and time as GMT in the same format as the LDM log
# file.
###############################################################################
sub get_date {
    # Return the current UTC time as "Mon D HH:MM:SS UTC" (for example
    # "May 7 09:03:41 UTC"); the day of month is not zero padded.
    #
    # The month names are a quoted word list held in a lexical, instead of
    # barewords stored in a package global.
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
    my ($sec, $min, $hour, $mday, $mon) = gmtime(time());

    return sprintf("%s %d %02d:%02d:%02d UTC",
                   $month_names[$mon], $mday, $hour, $min, $sec);
}
###############################################################################
# Print a usage message and exit. Should only be called when the command is
# usage, or command line arguments are bad or missing.
###############################################################################
sub print_usage {
    # Print the command summary on standard output and exit with status 1.
    # The text interpolates the current defaults ($pq_path, $surf_path,
    # $pq_size, $surf_size, $numlogs, $log_file, $log_hours, $feedset,
    # $pqact_conf, $ldmd_conf).  This routine never returns.
    #
    # The -s and -p options are accepted by the argument parser and are
    # therefore listed here as well.
    print "\n$progname\n";
    print "Usage: $progname command [options] [conf_file]";
    print "\n\ncommands:";
    print "\n\tstart [-v] [-q q_path]\t\t\tStart the LDM";
    print "\n\tstop\t\t\t\t\tStop the LDM";
    print "\n\trestart [-v] [-q q_path]\t\tRestart a running LDM";
    print "\n\tmkqueue [-v] [-c] [-q q_path] [-s size]";
    print "\n\t\t\t\t\t\tCreate a product queue";
    print "\n\tdelqueue [-q q_path]\t\t\tDelete a product queue";
    print "\n\tmksurfqueue [-v] [-c] [-q q_path] [-s size]";
    print "\n\t\t\t\t\t\tCreate a product queue";
    print "\n\t\t\t\t\t\tfor pqsurf";
    print "\n\tdelsurfqueue [-q q_path]\t\tDelete a pqsurf product queue";
    print "\n\tnewlog [-n numlogs] [-l logfile]\tRotate a log file";
    print "\n\tdostats\t\t\t\t\tMail statistics to Unidata";
    print "\n\tscour\t\t\t\t\tScour data directories";
    print "\n\tisrunning\t\t\t\tExit status 0 if LDM is running,";
    print "\n\t\t\t\t\t\t else exit 1";
    print "\n\tcheck [-t hours]\t\t\tAnalyze the LDM log files";
    print "\n\tpqactcheck [-p pqact_conf]\t\tCheck syntax for pqact files";
    print "\n\tpqactHUP\t\t\t\t\tSend HUP signal to pqact program";
    print "\n\tqueuecheck\t\tCheck for queue corruption";
    print "\n\twatch [-f feedpat]\t\t\tMonitor incoming products";
    print "\n\tps\t\t\t\t\tPrint LDM process information";
    print "\n\tconfig\t\t\t\t\tPrint LDM configuration";
    print "\n\tlog\t\t\t\t\tPage through the LDM log file";
    print "\n\ttail\t\t\t\t\tMonitor the LDM log file";
    print "\n\tclean\t\t\t\t\tRemoves lock and pid files";
    print "\n\tusage\t\t\t\t\tThis message\n";
    print "\n\noptions:";
    print "\n\t-v\t\tTurn on verbose mode";
    print "\n\t-c\t\tClobber an exisiting product queue";
    print "\n\t-q q_path\tSpecify a product queue path";
    print "\n\t\t\t Default $pq_path for LDM";
    print "\n\t\t\t Default $surf_path for pqsurf";
    print "\n\t-s size\t\tSpecify a product queue size in bytes";
    print "\n\t\t\t Default $pq_size for LDM";
    print "\n\t\t\t Default $surf_size for pqsurf";
    print "\n\t-n numlogs\tNumber of logs to rotate";
    print "\n\t\t\t Default $numlogs";
    print "\n\t-l logfile\tName of logfile";
    print "\n\t\t\t Default $log_file";
    print "\n\t-t hours\tNumber of hours to apply to command";
    print "\n\t\t\t Default $log_hours";
    print "\n\t-f feedset\tFeed set to use with command";
    print "\n\t\t\t Default $feedset";
    print "\n\t-p pqact_conf\tpqact configuration file to check";
    print "\n\t\t\t Default $pqact_conf";
    print "\n\nconf_file:";
    print "\n\twhich ldmd.conf file to use";
    print "\n\t Default $ldmd_conf";
    print "\n";

    # force the exit
    exit 1;     # assumption is that this routine is called
                # because of incorrect usage.
}
###############################################################################
# check for the existence of the lock file. Exit if found, create if not
# found.
###############################################################################
sub make_lockfile {
    # Create the lock file that serializes ldmadmin operations.  Exits with
    # status 1 when the lock file already exists or cannot be created.
    if (-e $lock_file) {
        # Report directly instead of calling bad_exit(): bad_exit() deletes
        # the lock file whenever it exists, and here the file belongs to the
        # other ldmadmin process, so its lock must be left alone.
        my $date_str = get_date();
        print STDERR sprintf("%s %s %s[%s]: %s\n",
            $date_str, $hostname, $progname, $<,
            "make_lockfile: another ldmadmin process exists"
            . " (if none is running, remove $lock_file with"
            . " '$progname clean')");
        exit 1;
    }

    # three-argument open on a lexical handle: the file name is never
    # interpreted as part of the open mode
    open(my $lock_fh, '>', $lock_file) ||
        bad_exit("make_lockfile: Can't open lock file $lock_file");
    close($lock_fh);
}
###############################################################################
# remove a lock file. exit if not found.
###############################################################################
sub rm_lockfile {
    # Delete $lock_file.  A missing lock file is treated as a fatal error
    # and reported through bad_exit(), which does not return.
    unless (-e $lock_file) {
        bad_exit("rm_lockfile: Lock file does not exist");
    }
    unlink($lock_file);
}
###############################################################################
# create a product queue
###############################################################################
sub make_pq {
    # Create the LDM product queue with pqcreate(1).
    #
    # Uses -q (via $q_path) and -s (via $q_size) to override the configured
    # $pq_path and $pq_size, and passes -v / -c through when $verbose /
    # $pq_clobber are set.  Returns 0 on success; every failure goes through
    # bad_exit(), which removes the lock file and exits.

    # lock file check
    make_lockfile();

    # can't do this while there is a server running
    # (check_running() returns 0 when a server is running)
    if (!check_running()) {
        bad_exit("make_pq: There is a server running, mkqueue aborted");
    }

    # set path and size if necessary; the size given with -s used to be
    # parsed but never applied here
    if ($q_path) {
        $pq_path = $q_path;
    }
    if ($q_size) {
        $pq_size = $q_size;
    }

    # build the command line
    my $cmd_line = "pqcreate";
    if ($verbose) {
        $cmd_line .= " -v";
    }
    if ($pq_clobber) {
        $cmd_line .= " -c";
    }
    $cmd_line .= " -q $pq_path -s $pq_size";

    # execute pqcreate
    `$cmd_line`;
    if ($?) {
        # bad_exit removes the lock file before exiting
        bad_exit("make_pq: mkqueue failed");
    }

    # remove the lockfile
    rm_lockfile();
    return 0;
}
###############################################################################
# delete a product queue; refused while check_running() reports a server
###############################################################################
sub delete_pq {
    # Delete the LDM product queue ($pq_path, or the path given with -q).
    # Returns 0 on success; a running server, a missing queue file or a
    # failed unlink is reported through bad_exit(), which exits.

    # lock file check
    make_lockfile();

    # check to see if the server is running. Exit if it is
    # (check_running() returns 0 when a server is running)
    if (!check_running()) {
        bad_exit("delete_pq: A server is running, cannot delete the queue");
    }

    # check for queue_path
    if ($q_path) {
        $pq_path = $q_path;
    }

    # kill the queue
    if (!-e $pq_path) {
        bad_exit("delete_pq: $pq_path does not exist");
    }
    # a failed unlink used to be ignored and reported as success
    unlink($pq_path) ||
        bad_exit("delete_pq: cannot remove $pq_path: $!");

    # remove the lock file
    rm_lockfile();
    return 0;
}
###############################################################################
# create a pqsurf product queue
###############################################################################
sub make_surf_pq {
    # Create the pqsurf product queue with pqcreate(1).
    #
    # Uses -q (via $q_path) and -s (via $q_size) to override the configured
    # $surf_path and $surf_size.  Returns 0 when pqcreate succeeded and 1
    # when it failed.

    # lock file check
    make_lockfile();

    # can't do this while there is a server running
    # (check_running() returns 0 when a server is running)
    if (!check_running()) {
        bad_exit("make_surf_pq: There is a server running, mkqueue aborted");
    }

    # set path and size if necessary
    if ($q_path) {
        $surf_path = $q_path;
    }
    if ($q_size) {
        $surf_size = $q_size;
    }

    # need the number of slots to create: 6881 per 1000000 bytes of queue,
    # truncated so that a size that is not a multiple of 1000000 does not
    # put a fractional number on the command line
    $surf_slots = int($surf_size / 1000000 * 6881);

    # build the command line
    my $cmd_line = "pqcreate";
    if ($verbose) {
        $cmd_line .= " -v";
    }
    if ($pq_clobber) {
        $cmd_line .= " -c";
    }
    $cmd_line .= " -S $surf_slots -q $surf_path -s $surf_size";

    # execute pqcreate.  $? is a wait status with the exit code in the high
    # byte; returning it raw would be cut down to its low 8 bits by exit()
    # and turn a failure into success, so it is reduced to 0 or 1 here.
    `$cmd_line`;
    my $retval = $? ? 1 : 0;

    # remove the lockfile
    rm_lockfile();
    return $retval;
}
###############################################################################
# delete a pqsurf product queue
###############################################################################
sub del_surf_pq {
    # Delete the pqsurf product queue ($surf_path, or the path given with
    # -q).  Returns 0 on success; a running server, a missing queue file or
    # a failed unlink is reported through bad_exit(), which exits.

    # lock file check
    make_lockfile();

    # check to see if the server is running. Exit if it is
    # (check_running() returns 0 when a server is running)
    if (!check_running()) {
        bad_exit("del_surf_pq: A server is running, cannot delete the queue");
    }

    # check for the queue path
    if ($q_path) {
        $surf_path = $q_path;
    }

    # kill the queue
    if (!-e $surf_path) {
        bad_exit("del_surf_pq: $surf_path does not exist");
    }
    # a failed unlink used to be ignored and reported as success
    unlink($surf_path) ||
        bad_exit("del_surf_pq: cannot remove $surf_path: $!");

    # remove the lock file
    rm_lockfile();
    return 0;
}
###############################################################################
# start the LDM server
###############################################################################
sub start_ldm {
    # Start rpc.ldmd and wait until it is both running and registered.
    #
    # Honors -v ($verbose), -q ($q_path) and the conf_file argument
    # ($ldmd_conf).  Returns 0 on success; every failure goes through
    # bad_exit(), which removes the lock file and exits.
    my($loopcount) = 1;

    print "starting the LDM server...\n";

    # create the lockfile
    make_lockfile();

    # make sure there is no other server running.  check_running() and
    # check_registered() both return 0 for "yes".
    if (!check_running()) {
        bad_exit("start_ldm: There is another server running, start aborted");
    }
    if (!check_registered()) {
        # the message is kept on one line; it used to contain a line break
        bad_exit("start_ldm: There is another server registered with the portmapper, start aborted");
    }

    # the usage text documents "start [-v] [-q q_path]", but the path given
    # with -q was never applied here
    if ($q_path) {
        $pq_path = $q_path;
    }

    # make sure we have a product queue in place
    if (!-e $pq_path) {
        bad_exit("product queue, $pq_path, does not exist");
    }

    # if log_rotate is other than 0, rotate the ldm logs
    if ($log_rotate) {
        new_log();
    }

    # build the command line; standard output of rpc.ldmd is captured in
    # the pid file
    my $cmd_line = "rpc.ldmd";
    if ($verbose) {
        $cmd_line .= " -v";
    }
    $cmd_line .= " -q $pq_path $ldmd_conf > $pid_file";
    `$cmd_line`;

    # check to make sure things are running: keep waiting (2, 3, 4, ...
    # seconds) while the server is not running or not registered, and give
    # up once the counter passes 65
    do {
        if($loopcount > 65) {
            bad_exit("start_ldm: Server not started or registered.");
        }
        $loopcount++;
        sleep($loopcount);
    }
    while(check_running() || check_registered());

    print "the LDM server has been started\n";

    # remove the lockfile
    rm_lockfile();
    return 0;
}
###############################################################################
# stop the LDM server
###############################################################################
sub stop_ldm {
    # Stop the LDM server and wait until it is gone.
    #
    # On Linux every rpc.ldmd process reported by pidof(8) is sent SIGTERM;
    # elsewhere only the process returned by getPid() is signalled.
    # Returns 0 on success; if the server is still present after the wait
    # loop gives up, bad_exit() is called and the program exits.
    my($loopcount) = 1;

    print "stopping the LDM server...\n";

    # create the lockfile
    make_lockfile();

    # handle linux better...
    if ($os eq 'Linux') {
        # one kill() for all processes instead of one shell per PID
        my @pids = grep { /^\d+$/ } split(' ', `pidof rpc.ldmd`);
        kill('TERM', @pids) if @pids;
    } else {
        # get pid
        $rpc_pid = getPid() ;
        # kill the server and associated processes; getPid() returns -1
        # when no server was found, which does not match here
        if ($rpc_pid =~ /^\s*(\d+)/) {
            kill('TERM', $1);
        }
    }

    # we may need to sleep to make sure that the port is deregistered
    # Beware the inverse logic of check_registered() and check_running():
    # both return 0 for "yes".  Keep waiting while the server is still
    # running OR still registered; the loop used to end as soon as either
    # one of the two had cleared, which let a following start fail.
    do {
        if($loopcount > 65) {
            bad_exit("stop_ldm: Server not dead.");
        }
        $loopcount++;
        sleep($loopcount);
    }
    while(!check_running() || !check_registered());

    print "LDM server stopped\n";

    # remove the lockfile
    rm_lockfile();

    # get rid of the pid file
    unlink($pid_file);
    return 0;
}
###############################################################################
# rotate the specified log file, keeping $numlogs files
###############################################################################
sub new_log {
    # Rotate $log_file with newlog, keeping $numlogs files, then run
    # hupsyslog.  Returns 0; a failed rotation ends the program through
    # bad_exit().
    `newlog $log_file $numlogs`;

    # bad_exit does not return, so hupsyslog only runs after a successful
    # rotation
    bad_exit("new_log: log rotation failed") if $?;

    `hupsyslog`;
    return 0;
}
###############################################################################
# print the LDM configuration information
###############################################################################
sub ldm_config {
    # Print the effective configuration values on standard output.
    # Always returns 0.
    print "\nhostname: $hostname\n",
          "ldmhome: $ldmhome\n",
          "bin path: $bin_path\n",
          "conf file: $ldmd_conf\n",
          "log file: $log_file\n",
          "data path: $data_path\n",
          "product queue: $pq_path\n",
          "queue size: $pq_size bytes\n",
          "PID file: $pid_file\n",
          "LDMHOSTNAME: $ENV{'LDMHOSTNAME'}\n",
          "PATH: $ENV{'PATH'}\n\n";
    return 0;
}
###############################################################################
# check if the LDM is registered with the local portmapper.
# Return 0 if so, 1 if not.
###############################################################################
sub check_registered {
    # Ask rpcinfo(8) whether RPC program 300029 answers on localhost over
    # TCP, trying version 5 first and version 4 second.
    # Returns 0 as soon as one version answers, 1 when neither does.
    $rpcinfo_cmd = "rpcinfo -t localhost 300029";

    foreach my $rpc_version (5, 4) {
        `$rpcinfo_cmd $rpc_version > /dev/null 2>&1`;
        return 0 unless $?;
    }
    return 1;
}
###############################################################################
# check if the LDM is running. return 0 if running, 1 if not.
###############################################################################
sub check_running {
    # Report whether an rpc.ldmd server is running.
    # Returns 0 if running, 1 if not (note the inverse logic).
    #
    # Linux: any process found by pidof(8) counts.  Other systems: the pid
    # from getPid() must show up as rpc.ldmd in the ps(1) output.
    if ($os eq 'Linux') {
        my @pids = split (' ', `pidof rpc.ldmd`);
        # return the documented 0/1; the comparison "@pids == 0" yielded an
        # empty string rather than 0 for "running"
        return @pids ? 0 : 1;
    }

    my $pid_num = getPid() ;
    return 1 if( $pid_num == -1 ) ;

    # SunOS 4 ps takes the pid as a plain argument, the others need -p.
    # Linux never gets here, so it is no longer tested for.
    my $ps_cmd;
    if ($os eq "SunOS" && $version =~ /^4/) {
        $ps_cmd = "ps $pid_num | grep rpc.ldmd | grep -v grep";
    } else {
        $ps_cmd = "ps -p $pid_num | grep rpc.ldmd | grep -v grep";
    }

    # the pipeline's status is that of the last grep: non-zero means no
    # rpc.ldmd line was left
    `$ps_cmd`;
    return $? ? 1 : 0;
}
###############################################################################
# get PID number. return pid or -1
###############################################################################
sub getPid {
    # Return the process id of the LDM server, or -1 when none is found.
    #
    # The pid file is tried first.  If it is missing, unreadable or does
    # not start with a number, the ps(1) listing is searched for a pqexpire
    # process and the value in its PPID column is returned.
    my( @F, $pid_num, $index, $default, $ps_cmd ) ;

    if (-e $pid_file) {
        # three-argument open on a lexical handle; an unreadable file just
        # falls through to the ps search
        if (open(my $pid_fh, '<', $pid_file)) {
            $pid_num = <$pid_fh>;
            close( $pid_fh ) ;
            # hand back only the digits, without any surrounding junk
            return $1 if( defined $pid_num && $pid_num =~ /^\s*(\d+)/ ) ;
        }
    }

    #the hard way
    # $default is the column used when the header has no PPID title
    if ($os eq "SunOS" && $version =~ /^4/) {
        $ps_cmd = "ps -gawxl" ;
        $default = 3 ;
    } elsif( $os eq "Linux") {
        $ps_cmd = "ps ajx" ;
        $default = 0 ;
    } else {
        $ps_cmd = "ps -eaf" ;
        $default = 2 ;
    }
    open( my $ps_fh, '-|', $ps_cmd ) || bad_exit("ps: Cannot open ps");

    # each platform has fields in different order, looking for PPID.
    # split(' ') ignores leading whitespace and splits on runs of blanks.
    $index = -1 ;
    my $header = <$ps_fh> ;
    if( defined $header ) {
        my @titles = split( ' ', $header ) ;
        for my $i ( 0 .. $#titles ) {
            if( $titles[ $i ] =~ /PPID/i ) {
                $index = $i ;
                last ;
            }
        }
    }
    $index = $default if( $index == -1 ) ;

    # search through all processes, looking for parent of pqexpire
    while( my $line = <$ps_fh> ) {
        next unless( $line =~ /pqexpire/ ) ;
        # get parent of pqexpire
        @F = split( ' ', $line ) ;
        last ;
    }
    close( $ps_fh ) ;

    # no pid, no ldm running
    return -1 if( $#F == -1 ) ;
    # a line with fewer columns than expected also counts as "not found"
    return -1 unless( defined $F[ $index ] ) ;
    return $F[ $index ] ;
}
###############################################################################
# restart the LDM, stopping it first if it is already running.
###############################################################################
sub restart_ldm {
    # Stop the server, pause 15 seconds, then start it again.
    # The value returned is whatever start_ldm() returns.
    stop_ldm();
    sleep( 15 );
    my $start_status = start_ldm();
    return $start_status;
}
###############################################################################
# list processes running under the current LDM process group
###############################################################################
sub ldmadmin_ps {
    # Print the ps(1) lines that belong to the LDM server found by
    # getPid().  Ends the program through bad_exit() when check_running()
    # reports that no server is running.
    bad_exit("ldmadmin_ps: no LDM server is running") if check_running();

    my $gpid_num = getPid() ;

    # pick the ps invocation for this platform
    my $ps_cmd;
    if ($os eq "SunOS" && $version =~ /^4/) {
        $ps_cmd = "ps -agwxj | egrep $gpid_num";
    }
    elsif ($os eq "Linux") {
        $ps_cmd = "ps ajx | grep $gpid_num" ;
    }
    else {
        $ps_cmd = "ps -lfg $gpid_num";
    }

    my $listing = `$ps_cmd`;
    print "$listing\n";
}
###############################################################################
# Check the pqact.conf file for errors
###############################################################################
sub ldmadmin_pqactcheck {
    # Run pqact against $pqact_conf with /dev/null as the queue and judge
    # the result from the second line of its combined stdout/stderr output.
    my @report = `pqact -vl - -q /dev/null $pqact_conf 2>&1` ;
    my $second_line = $report[ 1 ] ;

    if( $second_line =~ /Successfully read/ ) {
        print "$pqact_conf is syntactically correct\n" ;
    }
    else {
        # anything else is shown to the user as it was reported
        print "$second_line\n" ;
    }
}
###############################################################################
# HUP the pqact program
###############################################################################
sub ldmadmin_pqactHUP {
    # Send SIGHUP to every pqact process found in the ps(1) listing.
    # Returns 0 when every process found was signalled, 1 when none was
    # found or a signal could not be delivered.
    my( $ps_cmd, $default ) ;

    # $default is the column used when the header has no PID title
    if ($os eq "SunOS" && $version =~ /^4/) {
        $ps_cmd = "ps -gawxl" ;
        $default = 0 ;
    } elsif( $os eq "Linux") {
        $ps_cmd = "ps ajx" ;
        $default = 1 ;
    } else {
        $ps_cmd = "ps -eaf" ;
        $default = 1 ;
    }
    open( my $ps_fh, '-|', $ps_cmd ) || bad_exit("ps: Cannot open ps");

    # each platform has fields in different order, looking for PID
    # (the first title that contains PID but is not PPID)
    my $index = -1 ;
    my $header = <$ps_fh> ;
    if( defined $header ) {
        my @titles = split( ' ', $header ) ;
        for my $i ( 0 .. $#titles ) {
            next if( $titles[ $i ] =~ /PPID/i ) ;
            if( $titles[ $i ] =~ /PID/i ) {
                $index = $i ;
                last ;
            }
        }
    }
    $index = $default if( $index == -1 ) ;

    # search through all processes, looking for pqact
    my @pqact_pids ;
    while( my $line = <$ps_fh> ) {
        next unless( $line =~ /pqact/ ) ;
        # "ldmadmin pqactHUP" itself (and any shell wrapping it) also
        # contains "pqact"; signalling it would kill this very script
        next if( $progname ne '' && index( $line, $progname ) >= 0 ) ;
        my @F = split( ' ', $line ) ;
        my $pid = $F[ $index ] ;
        next unless( defined $pid && $pid =~ /^\d+$/ ) ;
        next if( $pid == $$ ) ;
        push( @pqact_pids, $pid ) ;
    }
    close( $ps_fh ) ;

    # nothing to signal: say so instead of running kill without a pid
    if( !@pqact_pids ) {
        print STDERR "$progname: no pqact process found\n" ;
        return 1 ;
    }

    print "Check pqact HUP with command ldmadmin tail\n" ;
    my $signalled = kill( 'HUP', @pqact_pids ) ;
    return ( $signalled == @pqact_pids ) ? 0 : 1 ;
}
###############################################################################
# Check the queue file for errors
###############################################################################
sub ldmadmin_queuecheck {
    # Read the product queue with pqcat, discarding its output, and report
    # the outcome.  Returns 0 when pqcat succeeded and 1 when it failed.
    #
    # NOTE(review): each backtick command runs in its own child process, so
    # the file-size limit requested by the first command is not in effect
    # for pqcat.  pqcat is also run without -q, so it reads whatever queue
    # it uses by default - confirm that this matches $pq_path.
    `ulimit -f 0` ;
    `pqcat -l - > /dev/null 2>&1` ;
    return $? ? 1 : 0;
}