Line of best fit using least squares method

So you have 10 datapoints and you want to find the correlation between the lot. There is a method to determine the line. It is called 'regression analysis using least squares'. There's a good page about it on wikipedia. If you speak and read Dutch I can recommend the books by 'Wijvekate' about the subject. The original title was 'Verklarende statistiek' and it may be translated in english or german.

For all datapoints several summings are performed:

Based on these sums, it is quite easy to determine the best fitting line 'y = a.x + b' where 'a' is the slope and 'b' is the intercept (value of 'y' for 'x=0').
b = (Sxy - Sx.Sy/n) / (Sxsq - n.(Sx/n)^2)
a = Sy/n - b.Sx/n
   
Those are easy to do calculations and summations. This makes the program quite simple. The only thing, as usual, is dealing with odd datalines, comment lines and end of file.

Bestfit, the source

Below is the sourcecode for bestfit1.mod but since 'mod' sed to be a sound file format for an old obscure computer your browser will try to play the file, which will not work. So you either use right mouseclick 'Save as' or 'shift-left-mousebutton' to immediately download the file.

MODULE bestfit1;

(*	Best fitting line using Least Squares Fit
	
	Formulas:	a = sy/n - b.sx/n
			b = (sxy - sx.sy/n) / (sxsq - n.(sx/n)^2)
	
	ver	date	does
	---	-----	-----------------------------------------------------------------
	 0	15-04	Nutting but not crashing too much
	 		Version 0 has difficulties with empty lines and long lines with garbage
	 1	16-04	Improved empty line handling
	 		Correlation coefficinet added according to Wijvekate pg 189
	 
	 *)

IMPORT	Args, Conv, Err, Out, Files, Math;


CONST	AsciiLF		= 0AX;
	AsciiTAB	= 09X;
	AsciiCR		= 0DX;


VAR	samples, col1, col2,
	line			: INTEGER;
	sx, sy, sxy, sxsq, 
	sysq, x, y		: REAL;
	inF			: Files.File;
	linebuffer		: ARRAY 1024 OF CHAR;
	eof			: BOOLEAN;


PROCEDURE SummIt (x, y : REAL);

BEGIN
  sx := sx + x;
  sy := sy + y;
  sxy := sxy + x * y;
  sxsq := sxsq + x * x;
  sysq := sysq + y * y;
  INC (samples)
END SummIt;


PROCEDURE CutIt (buf	: ARRAY OF CHAR;   p1, p2	  : INTEGER;   VAR x1, x2	  : REAL) : BOOLEAN;

VAR	done, i, n, p	: INTEGER;
	ch		: CHAR;
	str		: ARRAY 32 OF CHAR;

BEGIN
  n := 0;
  p := 0;
  done := 0;
  ch := buf [0];
  REPEAT
    INC (p);
    WHILE  ch = ' ' DO		(*  Eliminate whitespace	*)
      INC (n); 
      ch := buf [n];
      IF  ch = 0X  THEN  RETURN FALSE  END
    END;
    i := 0;
    REPEAT			(*  Extract a real number	*)
      ch := buf [n];
      INC (n); 
      IF  ch = 0X  THEN  RETURN FALSE  END;
      str [i] := ch;
      INC (i)
    UNTIL ch = ' ';
    str [n] := 0X;		(*  Terminate string		*)
    IF  p = p1  THEN  x1 := Conv.RealVal (str);  INC (done)  END;
    IF  p = p2  THEN  x2 := Conv.RealVal (str);  INC (done)  END
  UNTIL done = 2;
  RETURN TRUE
END CutIt;


PROCEDURE GetLine (VAR  buffer : ARRAY OF CHAR);

VAR	i	: INTEGER;
	ch	: CHAR;

BEGIN
  LOOP
    i := 0;
    REPEAT
      Files.ReadChar (inF, ch);
      IF  ch = AsciiCR   THEN  ch := ' '  END;
      IF  ch = AsciiTAB  THEN		(*   Replace TAB with three spaces		*)
	buffer [i] := ' ';	INC (i);
	buffer [i] := ' ';	INC (i);
	buffer [i] := ' '
      ELSE
	buffer [i] := ch
      END;
      INC (i);
      IF  Files.Eof (inF)  THEN  
	eof := TRUE;
	EXIT  
      END
    UNTIL  ch = AsciiLF;
    buffer [i] := ' ';	INC (i);	(*   Make sure 0X is never reached in normal records	*)
    buffer [i] := ' ';	INC (i);
    buffer [i] := 0X;
    IF  i > 10  THEN  EXIT  END		(*   Take care of empty lines	*)
  END
END GetLine;


PROCEDURE CalcIt;

VAR	a, b, r, n	: REAL;

BEGIN
  IF  samples > 3  THEN
    n := samples;
    b := (sxy - sx * sy / n) / (sxsq - sx * sx / n);
    a := (sy - b * sx) / n;
    r := (sxy - sx * sy / n) / Math.Sqrt ((sxsq - sx * sx / n) * (sysq - sy * sy / n));
    Out.Ln;
    Out.String ("Line of best fit is f(x) = ");
    Out.Real (b);
    Out.String (" x + ");
    Out.Real (a);
    Out.String (" with a correlation of "); 
    Out.Fixed (100 * r, 3, 1);
    Out.Char ('%');
    Out.Ln
  ELSE
    Out.Int (samples, 1);
    Out.String (" samples is not enough data")
  END;
  Out.Ln
END CalcIt;


PROCEDURE Init;

VAR	option		: ARRAY 64 OF CHAR;

BEGIN
  samples := 0;		sx := 0;	sy := 0;	sxy := 0;
  sxsq := 0;		inF := NIL;	line := 1;	eof := FALSE;
  Out.Ln;
  IF  Args.argc < 2  THEN
    Err.String ("Usage : 'bestfit datafile [col1 col2]'.");
    Err.Ln;
    Err.String ("If you omit 'col1 col2' the positions 1 and 2 are assumed for 'x' and 'y' respectively.");
    Err.Ln
  END;
  Args.GetArg (1, option);
  inF := Files.Open (option, "r");
  IF  inF = NIL  THEN
    Err.String ("Cannot open file ");	Err.String (option);	Err.String (". Aborting");
    Err.Ln;
    HALT (2)
  END;
  IF  Args.argc > 2  THEN
    Args.GetArg (2, option);	col1 := Conv.IntVal (option);
    Args.GetArg (3, option);	col2 := Conv.IntVal (option)
  ELSE
    col1 := 1;			col2 := 2
  END
END Init;

(*
PROCEDURE Dumpit;

BEGIN
  Out.String ("sx   = ");	Out.Real (sx);		Out.Ln;
  Out.String ("sy   = ");	Out.Real (sy);		Out.Ln;
  Out.String ("sxy  = ");	Out.Real (sxy);		Out.Ln;
  Out.String ("sxsq = ");	Out.Real (sxsq);	Out.Ln;
  Out.String ("sysq = ");	Out.Real (sysq);	Out.Ln;
  Out.String ("samples = ");	Out.Int (samples, 2);	Out.Ln
END Dumpit;
*)

PROCEDURE ShutDown;

BEGIN
  IF  inF # NIL  THEN  Files.Close (inF)  END
END ShutDown;


BEGIN
  Init;
  WHILE  eof = FALSE  DO
    GetLine (linebuffer);
    IF  linebuffer [0] # '#'  THEN
      IF  CutIt (linebuffer, col1, col2, x, y) = TRUE  THEN  
        Out.Real (x); Out.Char (09X); Out.Real (y); Out.Ln;
	SummIt (x, y)
      ELSE
        IF  eof = FALSE  THEN
	  Err.String ("Error processing line:");	Err.Ln;
	  Err.String (linebuffer);			Err.Ln;
	  Err.String ("Aborting at line ");		Err.Int (line, 3);		Err.Ln;
	  HALT (3)
	END
      END
    END;
    INC (line)
  END;
  CalcIt;
  ShutDown;
END bestfit1.
   
Compile the program, as usual, with
obc -o bestfit1 bestfit1.mod
The program accepts 1 or 3 arguments: Issuing the command "./bestfit1 pols 2 3" produces another line than "./bestfit1 pols 3 2". In the latter case, the values of x and y are reversed. The program reads the datafile line by line. The value at column 'col1' is treated as the 'x' value and the value at column 'col2' is treated as the 'y' value.

The correlation factor

Determining the line is nice. But how tight is the fit of the real data on the bestfitting line? For this, the correlation (co-relation) factor 'r' comes in handy. r is defined as:

r := (sxy - sx * sy / n) / Math.Sqrt ((sxsq - sx * sx / n) * (sysq - sy * sy / n))
There are many formula's on the net that look similar but deliver the wrong results. This formula, taken from the Wijvekate book, finds the correct value for 'r'. I express 'r' as a percentage. '1' or '100%' is the perfect value.

See it run

I constructed a test data set:

#	Bestfit test data file

#	Straight line, y = 2x + 3
#
#	x	y

	1	5
	2	7
	4	11
	0	3
	0.5	4
	-1	1
   
   
   
Containing comments, empty lines and lines just with some spaces. 'r' should be 100%. Let's see what it does:
jan@fluor:~/Oberon/bestfit$ ./bestfit1 slop

	1.00000 	5.00000
	2.00000 	7.00000
	4.00000        11.00000
	0.00000 	3.00000
	0.50000         4.00000
       -1.00000		1.00000

Line of best fit is f(x) = 2.00000 x + 3.00000 with a correlation of 100.0%
   
I'm a happy man.

Page created 16 April 2019,