Line of best fit using least squares method

So you have 10 datapoints and you want to find the correlation between the lot. There is a method to determine the line. It is called 'regression analysis using least squares'. There's a good page about it on wikipedia. If you speak and read Dutch I can recommend the books by 'Wijvekate' about the subject. The original title was 'Verklarende statistiek' and it may be translated in english or german.

For all datapoints several summings are performed:

• Sx = sum of all x values
• Sy = sum of all y values
• Sxy = sum of all x * y values (for each datapair)
• Sxsq = sum of all 'x squared' values
• Sysq = sum of all 'y squared' values
Based on these sums, it is quite easy to determine the best fitting line 'y = a.x + b' where 'a' is the slope and 'b' is the intercept (value of 'y' for 'x=0').
```b = (Sxy - Sx.Sy/n) / (Sxsq - n.(Sx/n)^2)
a = Sy/n - b.Sx/n
```
Those are easy to do calculations and summations. This makes the program quite simple. The only thing, as usual, is dealing with odd datalines, comment lines and end of file.

Bestfit, the source

Below is the sourcecode for bestfit1.mod but since 'mod' sed to be a sound file format for an old obscure computer your browser will try to play the file, which will not work. So you either use right mouseclick 'Save as' or 'shift-left-mousebutton' to immediately download the file.

```MODULE bestfit1;

(*	Best fitting line using Least Squares Fit

Formulas:	a = sy/n - b.sx/n
b = (sxy - sx.sy/n) / (sxsq - n.(sx/n)^2)

ver	date	does
---	-----	-----------------------------------------------------------------
0	15-04	Nutting but not crashing too much
Version 0 has difficulties with empty lines and long lines with garbage
1	16-04	Improved empty line handling
Correlation coefficinet added according to Wijvekate pg 189

*)

IMPORT	Args, Conv, Err, Out, Files, Math;

CONST	AsciiLF		= 0AX;
AsciiTAB	= 09X;
AsciiCR		= 0DX;

VAR	samples, col1, col2,
line			: INTEGER;
sx, sy, sxy, sxsq,
sysq, x, y		: REAL;
inF			: Files.File;
linebuffer		: ARRAY 1024 OF CHAR;
eof			: BOOLEAN;

PROCEDURE SummIt (x, y : REAL);

BEGIN
sx := sx + x;
sy := sy + y;
sxy := sxy + x * y;
sxsq := sxsq + x * x;
sysq := sysq + y * y;
INC (samples)
END SummIt;

PROCEDURE CutIt (buf	: ARRAY OF CHAR;   p1, p2	  : INTEGER;   VAR x1, x2	  : REAL) : BOOLEAN;

VAR	done, i, n, p	: INTEGER;
ch		: CHAR;
str		: ARRAY 32 OF CHAR;

BEGIN
n := 0;
p := 0;
done := 0;
ch := buf ;
REPEAT
INC (p);
WHILE  ch = ' ' DO		(*  Eliminate whitespace	*)
INC (n);
ch := buf [n];
IF  ch = 0X  THEN  RETURN FALSE  END
END;
i := 0;
REPEAT			(*  Extract a real number	*)
ch := buf [n];
INC (n);
IF  ch = 0X  THEN  RETURN FALSE  END;
str [i] := ch;
INC (i)
UNTIL ch = ' ';
str [n] := 0X;		(*  Terminate string		*)
IF  p = p1  THEN  x1 := Conv.RealVal (str);  INC (done)  END;
IF  p = p2  THEN  x2 := Conv.RealVal (str);  INC (done)  END
UNTIL done = 2;
RETURN TRUE
END CutIt;

PROCEDURE GetLine (VAR  buffer : ARRAY OF CHAR);

VAR	i	: INTEGER;
ch	: CHAR;

BEGIN
LOOP
i := 0;
REPEAT
IF  ch = AsciiCR   THEN  ch := ' '  END;
IF  ch = AsciiTAB  THEN		(*   Replace TAB with three spaces		*)
buffer [i] := ' ';	INC (i);
buffer [i] := ' ';	INC (i);
buffer [i] := ' '
ELSE
buffer [i] := ch
END;
INC (i);
IF  Files.Eof (inF)  THEN
eof := TRUE;
EXIT
END
UNTIL  ch = AsciiLF;
buffer [i] := ' ';	INC (i);	(*   Make sure 0X is never reached in normal records	*)
buffer [i] := ' ';	INC (i);
buffer [i] := 0X;
IF  i > 10  THEN  EXIT  END		(*   Take care of empty lines	*)
END
END GetLine;

PROCEDURE CalcIt;

VAR	a, b, r, n	: REAL;

BEGIN
IF  samples > 3  THEN
n := samples;
b := (sxy - sx * sy / n) / (sxsq - sx * sx / n);
a := (sy - b * sx) / n;
r := (sxy - sx * sy / n) / Math.Sqrt ((sxsq - sx * sx / n) * (sysq - sy * sy / n));
Out.Ln;
Out.String ("Line of best fit is f(x) = ");
Out.Real (b);
Out.String (" x + ");
Out.Real (a);
Out.String (" with a correlation of ");
Out.Fixed (100 * r, 3, 1);
Out.Char ('%');
Out.Ln
ELSE
Out.Int (samples, 1);
Out.String (" samples is not enough data")
END;
Out.Ln
END CalcIt;

PROCEDURE Init;

VAR	option		: ARRAY 64 OF CHAR;

BEGIN
samples := 0;		sx := 0;	sy := 0;	sxy := 0;
sxsq := 0;		inF := NIL;	line := 1;	eof := FALSE;
Out.Ln;
IF  Args.argc < 2  THEN
Err.String ("Usage : 'bestfit datafile [col1 col2]'.");
Err.Ln;
Err.String ("If you omit 'col1 col2' the positions 1 and 2 are assumed for 'x' and 'y' respectively.");
Err.Ln
END;
Args.GetArg (1, option);
inF := Files.Open (option, "r");
IF  inF = NIL  THEN
Err.String ("Cannot open file ");	Err.String (option);	Err.String (". Aborting");
Err.Ln;
HALT (2)
END;
IF  Args.argc > 2  THEN
Args.GetArg (2, option);	col1 := Conv.IntVal (option);
Args.GetArg (3, option);	col2 := Conv.IntVal (option)
ELSE
col1 := 1;			col2 := 2
END
END Init;

(*
PROCEDURE Dumpit;

BEGIN
Out.String ("sx   = ");	Out.Real (sx);		Out.Ln;
Out.String ("sy   = ");	Out.Real (sy);		Out.Ln;
Out.String ("sxy  = ");	Out.Real (sxy);		Out.Ln;
Out.String ("sxsq = ");	Out.Real (sxsq);	Out.Ln;
Out.String ("sysq = ");	Out.Real (sysq);	Out.Ln;
Out.String ("samples = ");	Out.Int (samples, 2);	Out.Ln
END Dumpit;
*)

PROCEDURE ShutDown;

BEGIN
IF  inF # NIL  THEN  Files.Close (inF)  END
END ShutDown;

BEGIN
Init;
WHILE  eof = FALSE  DO
GetLine (linebuffer);
IF  linebuffer  # '#'  THEN
IF  CutIt (linebuffer, col1, col2, x, y) = TRUE  THEN
Out.Real (x); Out.Char (09X); Out.Real (y); Out.Ln;
SummIt (x, y)
ELSE
IF  eof = FALSE  THEN
Err.String ("Error processing line:");	Err.Ln;
Err.String (linebuffer);			Err.Ln;
Err.String ("Aborting at line ");		Err.Int (line, 3);		Err.Ln;
HALT (3)
END
END
END;
INC (line)
END;
CalcIt;
ShutDown;
END bestfit1.
```
Compile the program, as usual, with
`obc -o bestfit1 bestfit1.mod`
The program accepts 1 or 3 arguments:
• 1 : only the datafilename
• 3 : first the datafile, then col1 (column to find the 'x' value), then col2 (column to find the 'y' value)
Issuing the command "./bestfit1 pols 2 3" produces another line than "./bestfit1 pols 3 2". In the latter case, the values of x and y are reversed. The program reads the datafile line by line. The value at column 'col1' is treated as the 'x' value and the value at column 'col2' is treated as the 'y' value.

The correlation factor

Determining the line is nice. But how tight is the fit of the real data on the bestfitting line? For this, the correlation (co-relation) factor 'r' comes in handy. r is defined as:

`r := (sxy - sx * sy / n) / Math.Sqrt ((sxsq - sx * sx / n) * (sysq - sy * sy / n))`
There are many formula's on the net that look similar but deliver the wrong results. This formula, taken from the Wijvekate book, finds the correct value for 'r'. I express 'r' as a percentage. '1' or '100%' is the perfect value.

See it run

I constructed a test data set:

```#	Bestfit test data file

#	Straight line, y = 2x + 3
#
#	x	y

1	5
2	7
4	11
0	3
0.5	4
-1	1

```
Containing comments, empty lines and lines just with some spaces. 'r' should be 100%. Let's see what it does:
```jan@fluor:~/Oberon/bestfit\$ ./bestfit1 slop

1.00000 	5.00000
2.00000 	7.00000
4.00000        11.00000
0.00000 	3.00000
0.50000         4.00000
-1.00000		1.00000

Line of best fit is f(x) = 2.00000 x + 3.00000 with a correlation of 100.0%
```
I'm a happy man.

Page created 16 April 2019,