基于Hadoop生态圈的数据仓库实践 —— 环境搭建（三）笔记-白红宇

基于Hadoop生态圈的数据仓库实践 —— 环境搭建（三）笔记

阅读量：6535 次

发布时间：2019-06-24

本文共 6496 字，大约阅读时间需要 21 分钟。

-- 建立源库表

USE source;

-- 建立客户表

CREATE TABLE customer (

customer_number INT NOT NULL AUTO_INCREMENT PRIMARY KEY comment '客户编号，主键',

customer_name VARCHAR(50) comment '客户名称',

customer_street_address VARCHAR(50) comment '客户住址',

customer_zip_code INT comment '邮编',

customer_city VARCHAR(30) comment '所在城市',

customer_state VARCHAR(2) comment '所在省份' );

-- 建立产品表

CREATE TABLE product (

product_code INT NOT NULL AUTO_INCREMENT PRIMARY KEY comment '产品编码，主键',

product_name VARCHAR(30) comment '产品名称',

product_category VARCHAR(30) comment '产品类型' );

-- 建立销售订单表

CREATE TABLE sales_order (

order_number INT NOT NULL AUTO_INCREMENT PRIMARY KEY comment '订单号，主键',

customer_number INT comment '客户编号',

product_code INT comment '产品编码',

order_date DATETIME comment '订单日期',

entry_date DATETIME comment '登记日期',

order_amount DECIMAL(10 , 2 ) comment '销售金额',

foreign key (customer_number)

references customer (customer_number)

on delete cascade on update cascade,

foreign key (product_code)

references product (product_code)

on delete cascade on update cascade );

-- 新增客户数据

INSERT INTO customer (customer_name, customer_street_address, customer_zip_code, customer_city, customer_state) VALUES ('Really Large Customers', '7500 Louise Dr.',17050, 'Mechanicsburg','PA'), ('Small Stores', '2500 Woodland St.',17055, 'Pittsburgh','PA'), ('Medium Retailers','1111 Ritter Rd.',17055,'Pittsburgh','PA'), ('Good Companies','9500 Scott St.',17050,'Mechanicsburg','PA'), ('Wonderful Shops','3333 Rossmoyne Rd.',17050,'Mechanicsburg','PA'), ('Loyal Clients','7070 Ritter Rd.',17055,'Pittsburgh','PA'), ('Distinguished Partners','9999 Scott St.',17050,'Mechanicsburg','PA');

-- 生成产品表测试数据

INSERT INTO product (product_name, product_category ) VALUES ('Hard Disk Drive', 'Storage'), ('Floppy Drive', 'Storage'), ('LCD Panel', 'Monitor');

-- 生成100条销售订单表测试数据

DROP PROCEDURE IF EXISTS generate_sales_order_data;

delimiter //

DROP PROCEDURE IF EXISTS generate_sales_order_data;

create procedure generate_sales_order_data()

begin

DROP TABLE IF EXISTS temp_sales_order_data;

CREATE TABLE temp_sales_order_data AS SELECT * FROM sales_order WHERE 1=0;

SET @start_date := unix_timestamp('2016-03-01');

SET @end_date := unix_timestamp('2016-07-01');

SET @i := 1;

while @i<=100 do

SET @customer_number := floor(1 + rand() * 6);

SET @product_code := floor(1 + rand() * 2);

SET @order_date := from_unixtime(@start_date + rand() * (@end_date - @start_date));

SET @amount := floor(1000 + rand() * 9000);

INSERT INTO temp_sales_order_data VALUES (@i,@customer_number,@product_code,@order_date,@order_date,@amount);

SET @i:=@i+1;

END while;

TRUNCATE TABLE sales_order;

INSERT INTO sales_order SELECT NULL,customer_number,product_code,order_date,entry_date,order_amount FROM temp_sales_order_data ORDER BY order_date;

select * from sales_order;

COMMIT;

end//

delimiter ;

call generate_sales_order_data();

-- 建立RDS库表

USE rds;

-- 建立客户过渡表

CREATE TABLE customer ( customer_number INT comment 'number', customer_name VARCHAR(30) comment 'name', customer_street_address VARCHAR(30) comment 'address', customer_zip_code INT comment 'zipcode', customer_city VARCHAR(30) comment 'city', customer_state VARCHAR(2) comment 'state');

-- 建立产品过渡表

CREATE TABLE product ( product_code INT comment 'code', product_name VARCHAR(30) comment 'name', product_category VARCHAR(30) comment 'category' );

-- 建立销售订单过渡表

CREATE TABLE sales_order ( order_number INT comment 'order number', customer_number INT comment 'customer number', product_code INT comment 'product code', order_date TIMESTAMP comment 'order date', entry_date TIMESTAMP comment 'entry date', order_amount DECIMAL(10 , 2 ) comment 'order amount');

-- 建立数据仓库表

USE dw;

-- 建立客户维度表

CREATE TABLE customer_dim ( customer_sk INT comment 'surrogate key', customer_number INT comment 'number', customer_name VARCHAR(50) comment 'name', customer_street_address VARCHAR(50) comment 'address', customer_zip_code INT comment 'zipcode', customer_city VARCHAR(30) comment 'city', customer_state VARCHAR(2) comment 'state', version INT comment 'version', effective_date DATE comment 'effective date', expiry_date DATE comment 'expiry date' )CLUSTERED BY (customer_sk) INTO 8 BUCKETS STORED AS ORC ;

-- 建立产品维度表

CREATE TABLE product_dim ( product_sk INT comment 'surrogate key', product_code INT comment 'code', product_name VARCHAR(30) comment 'name', product_category VARCHAR(30) comment 'category', version INT comment 'version', effective_date DATE comment 'effective date', expiry_date DATE comment 'expiry date' )CLUSTERED BY (product_sk) INTO 8 BUCKETS STORED AS ORC;

-- 建立订单维度表

CREATE TABLE order_dim ( order_sk INT comment 'surrogate key', order_number INT comment 'number', version INT comment 'version', effective_date DATE comment 'effective date', expiry_date DATE comment 'expiry date' )CLUSTERED BY (order_sk) INTO 8 BUCKETS STORED AS ORC ;

-- 建立销售订单事实表

CREATE TABLE sales_order_fact ( order_sk INT comment 'order surrogate key', customer_sk INT comment 'customer surrogate key', product_sk INT comment 'product surrogate key', order_date_sk INT comment 'date surrogate key', order_amount DECIMAL(10 , 2 ) comment 'order amount' )CLUSTERED BY (order_sk) INTO 8 BUCKETS STORED AS ORC;

#########date_dim_generate.sh########

date1="$1"

date2="$2"

tempdate=`date -d "$date1" +%F`

tempdateSec=`date -d "$date1" +%s`

enddateSec=`date -d "$date2" +%s`

min=1

max=`expr $ $enddateSec - $tempdateSec $ / $ 24 \* 60 \* 60 $ + 1`

cat /dev/null > ./date_dim.csv

#echo max

while [ $min -le $max ]

month=`date -d "$tempdate" +%m`