板凳-------Mysql cookbook学习（十一--------9)

13.2 分组描述统计

mysql> select age, count(score) as n,-> sum(score) as sum,-> min(score) as minimum,-> max(score) as maximum,-> avg(score) as mean,-> stddev_samp(score) as 'std. dev.',-> var_samp(score) as 'variance'-> from testscore-> group by age;
+-----+---+------+---------+---------+--------+--------------------+--------------------+
| age | n | sum  | minimum | maximum | mean   | std. dev.          | variance           |
+-----+---+------+---------+---------+--------+--------------------+--------------------+
|   5 | 4 |   22 |       4 |       7 | 5.5000 | 1.2909944487358056 | 1.6666666666666667 |
|   6 | 4 |   27 |       4 |       9 | 6.7500 |  2.217355782608345 |  4.916666666666667 |
|   7 | 4 |   30 |       6 |       9 | 7.5000 | 1.2909944487358056 | 1.6666666666666667 |
|   8 | 4 |   32 |       6 |      10 | 8.0000 | 1.8257418583505538 | 3.3333333333333335 |
|   9 | 4 |   35 |       7 |      10 | 8.7500 | 1.2583057392117918 |  1.583333333333334 |
+-----+---+------+---------+---------+--------+--------------------+--------------------+
5 rows in set (0.03 sec)mysql> select sex, count(score) as n,-> sum(score) as sum,-> min(score) as minimum,-> max(score) as maximum,-> avg(score) as mean,-> stddev_samp(score) as 'std. dev.',-> var_samp(score) as 'variance'-> from testscore-> group by sex;
+-----+----+------+---------+---------+--------+--------------------+--------------------+
| sex | n  | sum  | minimum | maximum | mean   | std. dev.          | variance           |
+-----+----+------+---------+---------+--------+--------------------+--------------------+
| M   | 10 |   71 |       4 |       9 | 7.1000 | 1.7919573407620817 | 3.2111111111111112 |
| F   | 10 |   75 |       4 |      10 | 7.5000 | 1.9578900207451218 | 3.8333333333333335 |
+-----+----+------+---------+---------+--------+--------------------+--------------------+
2 rows in set (0.00 sec)mysql> select age, sex,  count(score) as n,-> sum(score) as sum,-> min(score) as minimum,-> max(score) as maximum,-> avg(score) as mean,-> stddev_samp(score) as 'std. dev.',-> var_samp(score) as 'variance'-> from testscore-> group by age, sex;
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
| age | sex | n | sum  | minimum | maximum | mean   | std. dev.          | variance |
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
|   5 | M   | 2 |    9 |       4 |       5 | 4.5000 | 0.7071067811865476 |      0.5 |
|   5 | F   | 2 |   13 |       6 |       7 | 6.5000 | 0.7071067811865476 |      0.5 |
|   6 | M   | 2 |   17 |       8 |       9 | 8.5000 | 0.7071067811865476 |      0.5 |
|   6 | F   | 2 |   10 |       4 |       6 | 5.0000 | 1.4142135623730951 |        2 |
|   7 | M   | 2 |   14 |       6 |       8 | 7.0000 | 1.4142135623730951 |        2 |
|   7 | F   | 2 |   16 |       7 |       9 | 8.0000 | 1.4142135623730951 |        2 |
|   8 | M   | 2 |   15 |       6 |       9 | 7.5000 | 2.1213203435596424 |      4.5 |
|   8 | F   | 2 |   17 |       7 |      10 | 8.5000 | 2.1213203435596424 |      4.5 |
|   9 | M   | 2 |   16 |       7 |       9 | 8.0000 | 1.4142135623730951 |        2 |
|   9 | F   | 2 |   19 |       9 |      10 | 9.5000 | 0.7071067811865476 |      0.5 |
+-----+-----+---+------+---------+---------+--------+--------------------+----------+
10 rows in set (0.00 sec)

13.3 产生频率分布

mysql> select score, count(score) as occurence-> from testscore group by score;
+-------+-----------+
| score | occurence |
+-------+-----------+
|     5 |         1 |
|     4 |         2 |
|     6 |         4 |
|     7 |         4 |
|     8 |         2 |
|     9 |         5 |
|    10 |         2 |
+-------+-----------+
7 rows in set (0.00 sec)mysql> select @n := count(score) from  testscore;
+--------------------+
| @n := count(score) |
+--------------------+
|                 20 |
+--------------------+
1 row in set, 1 warning (0.01 sec)mysql> select score, (count(score) * 100)/@n as percent-> from testscore group by score;
+-------+---------+
| score | percent |
+-------+---------+
|     5 |  5.0000 |
|     4 | 10.0000 |
|     6 | 20.0000 |
|     7 | 20.0000 |
|     8 | 10.0000 |
|     9 | 25.0000 |
|    10 | 10.0000 |
+-------+---------+
7 rows in set (0.00 sec)mysql> select score, repeat('*', count(score)) as occurrences-> from testscore group by score;
+-------+-------------+
| score | occurrences |
+-------+-------------+
|     5 | *           |
|     4 | **          |
|     6 | ****        |
|     7 | ****        |
|     8 | **          |
|     9 | *****       |
|    10 | **          |
+-------+-------------+
7 rows in set (0.00 sec)mysql> select @n := count(score) from  testscore;
+--------------------+
| @n := count(score) |
+--------------------+
|                 20 |
+--------------------+
1 row in set, 1 warning (0.00 sec)mysql> select score, repeat('*', (count(score)*100)/@n) as percent-> from testscore group by score;
+-------+---------------------------+
| score | percent                   |
+-------+---------------------------+
|     5 | *****                     |
|     4 | **********                |
|     6 | ********************      |
|     7 | ********************      |
|     8 | **********                |
|     9 | ************************* |
|    10 | **********                |
+-------+---------------------------+
7 rows in set (0.00 sec)mysql> drop table if exists ref;
Query OK, 0 rows affected (0.03 sec)mysql> create table ref(score int);
Query OK, 0 rows affected (0.04 sec)mysql> insert into ref(score)-> values(0), (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);
Query OK, 11 rows affected (0.02 sec)
Records: 11  Duplicates: 0  Warnings: 0mysql> select ref.score, count(testscore.score) as occurences-> from ref left join testscore on ref.score = testscore.score-> group by ref.score;
+-------+------------+
| score | occurences |
+-------+------------+
|     0 |          0 |
|     1 |          0 |
|     2 |          0 |
|     3 |          0 |
|     4 |          2 |
|     5 |          1 |
|     6 |          4 |
|     7 |          4 |
|     8 |          2 |
|     9 |          5 |
|    10 |          2 |
+-------+------------+
11 rows in set (0.00 sec)mysql> select ref.score, (count(testscore.score)*100)/@n as percent-> from ref left join testscore on ref.score = testscore.score-> group by ref.score;
+-------+---------+
| score | percent |
+-------+---------+
|     0 |  0.0000 |
|     1 |  0.0000 |
|     2 |  0.0000 |
|     3 |  0.0000 |
|     4 | 10.0000 |
|     5 |  5.0000 |
|     6 | 20.0000 |
|     7 | 20.0000 |
|     8 | 10.0000 |
|     9 | 25.0000 |
|    10 | 10.0000 |
+-------+---------+
11 rows in set (0.00 sec)

13.4 计数缺失值

mysql> -- 创建表
mysql> CREATE TABLE subject_scores (->   subject INT,->   score INT NULL-> );
Query OK, 0 rows affected (0.06 sec)mysql>
mysql> -- 插入示例数据
mysql> INSERT INTO subject_scores VALUES->   (1, 38), (2, NULL), (3, 47),->   (4, 82), (5, NULL), (6, 65),->   (7, 90), (8, 73), (9, NULL),->   (10, 55), (11, 68), (12, 79);
Query OK, 12 rows affected (0.01 sec)
Records: 12  Duplicates: 0  Warnings: 0mysql>
mysql> -- 查询
mysql> SELECT * FROM subject_scores ORDER BY subject;
+---------+-------+
| subject | score |
+---------+-------+
|       1 |    38 |
|       2 |  NULL |
|       3 |    47 |
|       4 |    82 |
|       5 |  NULL |
|       6 |    65 |
|       7 |    90 |
|       8 |    73 |
|       9 |  NULL |
|      10 |    55 |
|      11 |    68 |
|      12 |    79 |
+---------+-------+
12 rows in set (0.00 sec)mysql> SELECT->   COUNT(*) AS 'n (total)',->   COUNT(score) AS 'n (nonmissing)',->   COUNT(*) - COUNT(score) AS 'n (missing)',->   ((COUNT(*) - COUNT(score)) * 100) / COUNT(*) AS '% missing'-> FROM subject_scores;
+-----------+----------------+-------------+-----------+
| n (total) | n (nonmissing) | n (missing) | % missing |
+-----------+----------------+-------------+-----------+
|        12 |              9 |           3 |   25.0000 |
+-----------+----------------+-------------+-----------+
1 row in set (0.00 sec)mysql> select count(*) as 'n (total)',-> count(score) as  'n (nonmissing)',-> sum(isnull(score)) as 'n (missing)',-> (sum(isnull(score))* 100) / count(*) as '% missing'-> from subject_scores;
+-----------+----------------+-------------+-----------+
| n (total) | n (nonmissing) | n (missing) | % missing |
+-----------+----------------+-------------+-----------+
|        12 |              9 |           3 |   25.0000 |
+-----------+----------------+-------------+-----------+
1 row in set (0.00 sec)如果您确实需要按某些条件分组：
需要先确定分组依据。例如，如果：
•	科目1-6是A组
•	科目7-12是B组
可以这样写：
sql
mysql> SELECT->   CASE WHEN subject BETWEEN 1 AND 6 THEN 'A' ELSE 'B' END AS group_name,->   COUNT(*) AS 'n (total)',->   COUNT(score) AS 'n (nonmissing)',->   COUNT(*) - COUNT(score) AS 'n (missing)',->   ((COUNT(*) - COUNT(score)) * 100) / COUNT(*) AS '% missing'-> FROM subject_scores-> GROUP BY group_name;
+------------+-----------+----------------+-------------+-----------+
| group_name | n (total) | n (nonmissing) | n (missing) | % missing |
+------------+-----------+----------------+-------------+-----------+
| A          |         6 |              4 |           2 |   33.3333 |
| B          |         6 |              5 |           1 |   16.6667 |
+------------+-----------+----------------+-------------+-----------+
2 rows in set (0.00 sec)

13.5 计算线性回归和相关系数

mysql> select age, score from testscore;
+-----+-------+
| age | score |
+-----+-------+
|   5 |     5 |
|   5 |     4 |
|   5 |     6 |
|   5 |     7 |
|   6 |     8 |
|   6 |     9 |
|   6 |     4 |
|   6 |     6 |
|   7 |     8 |
|   7 |     6 |
|   7 |     9 |
|   7 |     7 |
|   8 |     9 |
|   8 |     6 |
|   8 |     7 |
|   8 |    10 |
|   9 |     9 |
|   9 |     7 |
|   9 |    10 |
|   9 |     9 |
+-----+-------+
20 rows in set (0.00 sec)mysql> SELECT->   @n := COUNT(score) AS n,->   @meanx := AVG(age) AS 'x mean',->   @sumx := SUM(age) AS 'x sum',->   @sumxx := SUM(age * age) AS 'x sum of squares',->   @meany := AVG(score) AS 'y mean',->   @sumy := SUM(score) AS 'y sum',->   @sumyy := SUM(score * score) AS 'y sum of squares',->   @sumxy := SUM(age * score) AS 'x*y sum'-> FROM testscore\G
*************************** 1. row ***************************n: 20x mean: 7.0000x sum: 140
x sum of squares: 1020y mean: 7.3000y sum: 146
y sum of squares: 1130x*y sum: 1053
1 row in set, 8 warnings (0.00 sec)mysql> select-> @b := (@n * @sumxy - @sumx * @sumy)/ (@n * @sumxx - @sumx * @sumx)-> as slope;
+-------------+
| slope       |
+-------------+
| 0.775000000 |
+-------------+
1 row in set, 1 warning (0.00 sec)mysql> select @a := (@meany - @b * @meanx) as intercept;
+----------------------+
| intercept            |
+----------------------+
| 1.875000000000000000 |
+----------------------+
1 row in set, 1 warning (0.00 sec)mysql> select concat('y =', @b, 'x + ', @a) as 'least-squares regression';
+----------------------------------------+
| least-squares regression               |
+----------------------------------------+
| y =0.775000000x + 1.875000000000000000 |
+----------------------------------------+
1 row in set (0.00 sec)mysql> select-> (@n * @sumxy - @sumx * @sumy)-> /sqrt((@n * @sumxx - @sumx * @sumx) * (@n * @sumyy - @sumy * @sumy ))-> as correlation;
+--------------------+
| correlation        |
+--------------------+
| 0.6117362044219903 |
+--------------------+
1 row in set (0.00 sec)