The three principles of tidy data, as set out in Hadley Wickham's paper:

1. Each variable forms a column.
2. Each observation forms a row.
3. Each type of observational unit forms a table.
```python
import pandas as pd
import numpy as np
```
Is this data tidy? What would a tidy version of it look like?
```python
df = pd.DataFrame({
    "First": ["John", "Jane", "Mary"],
    "Last": ["Smith", "Doe", "Johnson"],
    "Treatment A": [np.nan, 16, 3],
    "Treatment B": [2, 11, 1]
})
df
```
  | First | Last | Treatment A | Treatment B |
---|---|---|---|---|
0 | John | Smith | NaN | 2 |
1 | Jane | Doe | 16.0 | 11 |
2 | Mary | Johnson | 3.0 | 1 |
The melt function is the standard tool for converting data from wide format to long format. It assumes your table contains identifier variables, which you want to leave as they are, and measured variables, which you want to unpivot into rows.
```python
tidy = pd.melt(
    df,
    id_vars=['First', 'Last'],  # identifiers
    # var_name sets the name of the column that will hold the variable,
    # whose values correspond to the names of the former columns
    var_name='treatment',
    value_name='result')
tidy
```
  | First | Last | treatment | result |
---|---|---|---|---|
0 | John | Smith | Treatment A | NaN |
1 | Jane | Doe | Treatment A | 16.0 |
2 | Mary | Johnson | Treatment A | 3.0 |
3 | John | Smith | Treatment B | 2.0 |
4 | Jane | Doe | Treatment B | 11.0 |
5 | Mary | Johnson | Treatment B | 1.0 |
Why is this better? For example, missing values can now be dropped safely:
```python
tidy.dropna()
```
  | First | Last | treatment | result |
---|---|---|---|---|
1 | Jane | Doe | Treatment A | 16.0 |
2 | Mary | Johnson | Treatment A | 3.0 |
3 | John | Smith | Treatment B | 2.0 |
4 | Jane | Doe | Treatment B | 11.0 |
5 | Mary | Johnson | Treatment B | 1.0 |
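Beyond dropping NaNs, the tidy layout makes grouped summaries one-liners. A minimal sketch using the `tidy` frame above, computing the mean result per treatment:

```python
# mean result per treatment, skipping the missing value (a sketch)
tidy.dropna().groupby('treatment')['result'].mean()
```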
The melt function works as follows:
```python
messy = pd.DataFrame({'row': ['A', 'B', 'C'],
                      'a': [1, 2, 3],
                      'b': [4, 5, 6],
                      'c': [7, 8, 9]})
messy
```
  | a | b | c | row |
---|---|---|---|---|
0 | 1 | 4 | 7 | A |
1 | 2 | 5 | 8 | B |
2 | 3 | 6 | 9 | C |
```python
pd.melt(messy, id_vars='row')
```
  | row | variable | value |
---|---|---|---|
0 | A | a | 1 |
1 | B | a | 2 |
2 | C | a | 3 |
3 | A | b | 4 |
4 | B | b | 5 |
5 | C | b | 6 |
6 | A | c | 7 |
7 | B | c | 8 |
8 | C | c | 9 |
```python
tidy = pd.melt(messy, id_vars='row', var_name='dimension', value_name='length')
tidy
```
  | row | dimension | length |
---|---|---|---|
0 | A | a | 1 |
1 | B | a | 2 |
2 | C | a | 3 |
3 | A | b | 4 |
4 | B | b | 5 |
5 | C | b | 6 |
6 | A | c | 7 |
7 | B | c | 8 |
8 | C | c | 9 |
Exercise: turn the tidy data back into a messy form, using functions you already know.
```python
messy1 = tidy.pivot(index='row', columns='dimension', values='length')
messy1.reset_index(inplace=True)   # move 'row' back out of the index
messy1.columns.name = ''           # drop the leftover columns name
messy1
```
  | row | a | b | c |
---|---|---|---|---|
0 | A | 1 | 4 | 7 |
1 | B | 2 | 5 | 8 |
2 | C | 3 | 6 | 9 |
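The same round trip can be made without pivot, via set_index and unstack. A minimal sketch on the tidy frame above (`messy1b` is just an illustrative name):

```python
# equivalent un-tidying via set_index/unstack (a sketch)
messy1b = tidy.set_index(['row', 'dimension'])['length'].unstack()
messy1b.reset_index(inplace=True)
messy1b.columns.name = ''
messy1b
```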
```python
pew = pd.read_csv("pew.csv", sep=";")
pew
```
  | religion | <$10k | $10-20k | $20-30k | $30-40k | $40-50k | $50-75k |
---|---|---|---|---|---|---|---|
0 | Agnostic | 27 | 34 | 60 | 81 | 76 | 137 |
1 | Atheist | 12 | 27 | 37 | 52 | 35 | 70 |
2 | Buddhist | 27 | 21 | 30 | 34 | 33 | 58 |
3 | Catholic | 418 | 617 | 732 | 670 | 638 | 1116 |
4 | Don't know/refused | 15 | 14 | 15 | 11 | 10 | 35 |
5 | Evangelical Prot | 575 | 869 | 1064 | 982 | 881 | 1486 |
6 | Hindu | 1 | 9 | 7 | 9 | 11 | 34 |
7 | Historically Black Prot | 228 | 244 | 236 | 238 | 197 | 223 |
8 | Jehovah's Witness | 20 | 27 | 24 | 24 | 21 | 30 |
9 | Jewish | 19 | 19 | 25 | 25 | 30 | 95 |
```python
pew_tidy = pd.melt(pew, id_vars=['religion'], var_name='income', value_name='freq')
pew_tidy.sort_values(by=['religion'], inplace=True)
pew_tidy
```
  | religion | income | freq |
---|---|---|---|
0 | Agnostic | <$10k | 27 |
30 | Agnostic | $30-40k | 81 |
40 | Agnostic | $40-50k | 76 |
50 | Agnostic | $50-75k | 137 |
10 | Agnostic | $10-20k | 34 |
20 | Agnostic | $20-30k | 60 |
41 | Atheist | $40-50k | 35 |
21 | Atheist | $20-30k | 37 |
11 | Atheist | $10-20k | 27 |
31 | Atheist | $30-40k | 52 |
1 | Atheist | <$10k | 12 |
51 | Atheist | $50-75k | 70 |
42 | Buddhist | $40-50k | 33 |
2 | Buddhist | <$10k | 27 |
12 | Buddhist | $10-20k | 21 |
32 | Buddhist | $30-40k | 34 |
52 | Buddhist | $50-75k | 58 |
22 | Buddhist | $20-30k | 30 |
43 | Catholic | $40-50k | 638 |
33 | Catholic | $30-40k | 670 |
23 | Catholic | $20-30k | 732 |
53 | Catholic | $50-75k | 1116 |
13 | Catholic | $10-20k | 617 |
3 | Catholic | <$10k | 418 |
24 | Don't know/refused | $20-30k | 15 |
44 | Don't know/refused | $40-50k | 10 |
4 | Don't know/refused | <$10k | 15 |
54 | Don't know/refused | $50-75k | 35 |
14 | Don't know/refused | $10-20k | 14 |
34 | Don't know/refused | $30-40k | 11 |
25 | Evangelical Prot | $20-30k | 1064 |
55 | Evangelical Prot | $50-75k | 1486 |
15 | Evangelical Prot | $10-20k | 869 |
5 | Evangelical Prot | <$10k | 575 |
45 | Evangelical Prot | $40-50k | 881 |
35 | Evangelical Prot | $30-40k | 982 |
56 | Hindu | $50-75k | 34 |
26 | Hindu | $20-30k | 7 |
46 | Hindu | $40-50k | 11 |
16 | Hindu | $10-20k | 9 |
36 | Hindu | $30-40k | 9 |
6 | Hindu | <$10k | 1 |
47 | Historically Black Prot | $40-50k | 197 |
37 | Historically Black Prot | $30-40k | 238 |
57 | Historically Black Prot | $50-75k | 223 |
27 | Historically Black Prot | $20-30k | 236 |
17 | Historically Black Prot | $10-20k | 244 |
7 | Historically Black Prot | <$10k | 228 |
8 | Jehovah's Witness | <$10k | 20 |
38 | Jehovah's Witness | $30-40k | 24 |
58 | Jehovah's Witness | $50-75k | 30 |
48 | Jehovah's Witness | $40-50k | 21 |
28 | Jehovah's Witness | $20-30k | 24 |
18 | Jehovah's Witness | $10-20k | 27 |
29 | Jewish | $20-30k | 25 |
39 | Jewish | $30-40k | 25 |
19 | Jewish | $10-20k | 19 |
9 | Jewish | <$10k | 19 |
49 | Jewish | $40-50k | 30 |
59 | Jewish | $50-75k | 95 |
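As plain strings the income brackets do not sort in their natural order ('$10-20k' would come before '<$10k'). An ordered categorical fixes this; a sketch using the bracket labels shown above:

```python
# give the income brackets an explicit order (a sketch)
income_order = ['<$10k', '$10-20k', '$20-30k', '$30-40k', '$40-50k', '$50-75k']
pew_tidy['income'] = pd.Categorical(pew_tidy['income'],
                                    categories=income_order, ordered=True)
pew_tidy.sort_values(by=['religion', 'income'])
```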
```python
messy2 = pd.read_csv("tb_edited.csv")
messy2 = pd.melt(messy2, id_vars=['country', 'year'], value_name='cases')
messy2.sort_values(by=['year', 'country'], inplace=True)
messy2.head()
```
  | country | year | variable | cases |
---|---|---|---|---|
0 | AD | 2000 | m014 | 0.0 |
201 | AD | 2000 | m1524 | 0.0 |
402 | AD | 2000 | m2534 | 1.0 |
603 | AD | 2000 | m3544 | 0.0 |
804 | AD | 2000 | m4554 | 0.0 |
```python
# 'mu' does not follow the sex+age-group pattern; drop those rows
messy2 = messy2[messy2['variable'] != 'mu']

def parse_age(s):
    s = s[1:]                      # strip the leading sex character
    if s == '65':
        return '65+'
    else:
        return s[:-2] + '-' + s[-2:]

messy2['sex'] = messy2['variable'].apply(lambda s: s[:1])
messy2['age'] = messy2['variable'].apply(parse_age)
messy2 = messy2[['country', 'year', 'sex', 'age', 'cases']]
messy2.head()
```
  | country | year | sex | age | cases |
---|---|---|---|---|---|
0 | AD | 2000 | m | 0-14 | 0.0 |
201 | AD | 2000 | m | 15-24 | 0.0 |
402 | AD | 2000 | m | 25-34 | 1.0 |
603 | AD | 2000 | m | 35-44 | 0.0 |
804 | AD | 2000 | m | 45-54 | 0.0 |
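The apply calls above can also be replaced with the vectorized .str accessor. A sketch, assuming it runs before the final column selection (it needs the 'variable' column with codes like 'm014', 'm1524', 'm65'):

```python
# vectorized sex/age split (a sketch; needs the 'variable' column,
# i.e. run before the final column selection above)
sex = messy2['variable'].str[0]        # 'm' or 'f'
code = messy2['variable'].str[1:]      # '014', '1524', ..., '65'
age = np.where(code == '65', '65+', code.str[:-2] + '-' + code.str[-2:])
```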
The most complicated form of messy data occurs when variables are stored in both rows and columns. Table 11 shows daily weather data from the Global Historical Climatology Network for one weather station (MX17004) in Mexico for five months in 2010. It has variables in individual columns (id, year, month), spread across columns (day, d1–d31) and across rows (tmin, tmax) (minimum and maximum temperature). Months with less than 31 days have structural missing values for the last day(s) of the month. The element column is not a variable; it stores the names of variables.
```python
weather = pd.read_csv("https://raw.githubusercontent.com/tidyverse/tidyr/master/vignettes/weather.csv")
weather.head()
```
  | id | year | month | element | d1 | d2 | d3 | d4 | d5 | d6 | ... | d22 | d23 | d24 | d25 | d26 | d27 | d28 | d29 | d30 | d31 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | MX17004 | 2010 | 1 | tmax | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 27.8 | NaN |
1 | MX17004 | 2010 | 1 | tmin | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 14.5 | NaN |
2 | MX17004 | 2010 | 2 | tmax | NaN | 27.3 | 24.1 | NaN | NaN | NaN | ... | NaN | 29.9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | MX17004 | 2010 | 2 | tmin | NaN | 14.4 | 14.4 | NaN | NaN | NaN | ... | NaN | 10.7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | MX17004 | 2010 | 3 | tmax | NaN | NaN | NaN | NaN | 32.1 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 35 columns
```python
weather2 = pd.melt(
    weather,
    id_vars=['id', 'year', 'month', 'element'],
    var_name='day').dropna()
weather2
```
  | id | year | month | element | day | value |
---|---|---|---|---|---|---|
20 | MX17004 | 2010 | 12 | tmax | d1 | 29.9 |
21 | MX17004 | 2010 | 12 | tmin | d1 | 13.8 |
24 | MX17004 | 2010 | 2 | tmax | d2 | 27.3 |
25 | MX17004 | 2010 | 2 | tmin | d2 | 14.4 |
40 | MX17004 | 2010 | 11 | tmax | d2 | 31.3 |
41 | MX17004 | 2010 | 11 | tmin | d2 | 16.3 |
46 | MX17004 | 2010 | 2 | tmax | d3 | 24.1 |
47 | MX17004 | 2010 | 2 | tmin | d3 | 14.4 |
56 | MX17004 | 2010 | 7 | tmax | d3 | 28.6 |
57 | MX17004 | 2010 | 7 | tmin | d3 | 17.5 |
84 | MX17004 | 2010 | 11 | tmax | d4 | 27.2 |
85 | MX17004 | 2010 | 11 | tmin | d4 | 12.0 |
92 | MX17004 | 2010 | 3 | tmax | d5 | 32.1 |
93 | MX17004 | 2010 | 3 | tmin | d5 | 14.2 |
102 | MX17004 | 2010 | 8 | tmax | d5 | 29.6 |
103 | MX17004 | 2010 | 8 | tmin | d5 | 15.8 |
104 | MX17004 | 2010 | 10 | tmax | d5 | 27.0 |
105 | MX17004 | 2010 | 10 | tmin | d5 | 14.0 |
106 | MX17004 | 2010 | 11 | tmax | d5 | 26.3 |
107 | MX17004 | 2010 | 11 | tmin | d5 | 7.9 |
130 | MX17004 | 2010 | 12 | tmax | d6 | 27.8 |
131 | MX17004 | 2010 | 12 | tmin | d6 | 10.5 |
148 | MX17004 | 2010 | 10 | tmax | d7 | 28.1 |
149 | MX17004 | 2010 | 10 | tmin | d7 | 12.9 |
168 | MX17004 | 2010 | 8 | tmax | d8 | 29.0 |
169 | MX17004 | 2010 | 8 | tmin | d8 | 17.3 |
202 | MX17004 | 2010 | 3 | tmax | d10 | 34.5 |
203 | MX17004 | 2010 | 3 | tmin | d10 | 16.8 |
222 | MX17004 | 2010 | 2 | tmax | d11 | 29.7 |
223 | MX17004 | 2010 | 2 | tmin | d11 | 13.4 |
... | ... | ... | ... | ... | ... | ... |
324 | MX17004 | 2010 | 10 | tmax | d15 | 28.7 |
325 | MX17004 | 2010 | 10 | tmin | d15 | 10.5 |
334 | MX17004 | 2010 | 3 | tmax | d16 | 31.1 |
335 | MX17004 | 2010 | 3 | tmin | d16 | 17.6 |
362 | MX17004 | 2010 | 6 | tmax | d17 | 28.0 |
363 | MX17004 | 2010 | 6 | tmin | d17 | 17.5 |
486 | MX17004 | 2010 | 2 | tmax | d23 | 29.9 |
487 | MX17004 | 2010 | 2 | tmin | d23 | 10.7 |
498 | MX17004 | 2010 | 8 | tmax | d23 | 26.4 |
499 | MX17004 | 2010 | 8 | tmin | d23 | 15.0 |
542 | MX17004 | 2010 | 8 | tmax | d25 | 29.7 |
543 | MX17004 | 2010 | 8 | tmin | d25 | 15.6 |
568 | MX17004 | 2010 | 11 | tmax | d26 | 28.1 |
569 | MX17004 | 2010 | 11 | tmin | d26 | 12.1 |
578 | MX17004 | 2010 | 4 | tmax | d27 | 36.3 |
579 | MX17004 | 2010 | 4 | tmin | d27 | 16.7 |
580 | MX17004 | 2010 | 5 | tmax | d27 | 33.2 |
581 | MX17004 | 2010 | 5 | tmin | d27 | 18.2 |
590 | MX17004 | 2010 | 11 | tmax | d27 | 27.7 |
591 | MX17004 | 2010 | 11 | tmin | d27 | 14.2 |
610 | MX17004 | 2010 | 10 | tmax | d28 | 31.2 |
611 | MX17004 | 2010 | 10 | tmin | d28 | 15.0 |
626 | MX17004 | 2010 | 6 | tmax | d29 | 30.1 |
627 | MX17004 | 2010 | 6 | tmin | d29 | 18.0 |
630 | MX17004 | 2010 | 8 | tmax | d29 | 28.0 |
631 | MX17004 | 2010 | 8 | tmin | d29 | 15.3 |
638 | MX17004 | 2010 | 1 | tmax | d30 | 27.8 |
639 | MX17004 | 2010 | 1 | tmin | d30 | 14.5 |
674 | MX17004 | 2010 | 8 | tmax | d31 | 25.4 |
675 | MX17004 | 2010 | 8 | tmin | d31 | 15.4 |
66 rows × 6 columns
```python
from datetime import datetime

def f(row):
    # build a real date from year, month, and the 'dNN' day label
    return datetime(row['year'], row['month'], int(row['day'][1:]))

weather3 = weather2.copy()
weather3['date'] = weather2.apply(f, axis=1)
weather3 = weather3[['id', 'element', 'value', 'date']]
weather3
```
  | id | element | value | date |
---|---|---|---|---|
20 | MX17004 | tmax | 29.9 | 2010-12-01 |
21 | MX17004 | tmin | 13.8 | 2010-12-01 |
24 | MX17004 | tmax | 27.3 | 2010-02-02 |
25 | MX17004 | tmin | 14.4 | 2010-02-02 |
40 | MX17004 | tmax | 31.3 | 2010-11-02 |
41 | MX17004 | tmin | 16.3 | 2010-11-02 |
46 | MX17004 | tmax | 24.1 | 2010-02-03 |
47 | MX17004 | tmin | 14.4 | 2010-02-03 |
56 | MX17004 | tmax | 28.6 | 2010-07-03 |
57 | MX17004 | tmin | 17.5 | 2010-07-03 |
84 | MX17004 | tmax | 27.2 | 2010-11-04 |
85 | MX17004 | tmin | 12.0 | 2010-11-04 |
92 | MX17004 | tmax | 32.1 | 2010-03-05 |
93 | MX17004 | tmin | 14.2 | 2010-03-05 |
102 | MX17004 | tmax | 29.6 | 2010-08-05 |
103 | MX17004 | tmin | 15.8 | 2010-08-05 |
104 | MX17004 | tmax | 27.0 | 2010-10-05 |
105 | MX17004 | tmin | 14.0 | 2010-10-05 |
106 | MX17004 | tmax | 26.3 | 2010-11-05 |
107 | MX17004 | tmin | 7.9 | 2010-11-05 |
130 | MX17004 | tmax | 27.8 | 2010-12-06 |
131 | MX17004 | tmin | 10.5 | 2010-12-06 |
148 | MX17004 | tmax | 28.1 | 2010-10-07 |
149 | MX17004 | tmin | 12.9 | 2010-10-07 |
168 | MX17004 | tmax | 29.0 | 2010-08-08 |
169 | MX17004 | tmin | 17.3 | 2010-08-08 |
202 | MX17004 | tmax | 34.5 | 2010-03-10 |
203 | MX17004 | tmin | 16.8 | 2010-03-10 |
222 | MX17004 | tmax | 29.7 | 2010-02-11 |
223 | MX17004 | tmin | 13.4 | 2010-02-11 |
... | ... | ... | ... | ... |
324 | MX17004 | tmax | 28.7 | 2010-10-15 |
325 | MX17004 | tmin | 10.5 | 2010-10-15 |
334 | MX17004 | tmax | 31.1 | 2010-03-16 |
335 | MX17004 | tmin | 17.6 | 2010-03-16 |
362 | MX17004 | tmax | 28.0 | 2010-06-17 |
363 | MX17004 | tmin | 17.5 | 2010-06-17 |
486 | MX17004 | tmax | 29.9 | 2010-02-23 |
487 | MX17004 | tmin | 10.7 | 2010-02-23 |
498 | MX17004 | tmax | 26.4 | 2010-08-23 |
499 | MX17004 | tmin | 15.0 | 2010-08-23 |
542 | MX17004 | tmax | 29.7 | 2010-08-25 |
543 | MX17004 | tmin | 15.6 | 2010-08-25 |
568 | MX17004 | tmax | 28.1 | 2010-11-26 |
569 | MX17004 | tmin | 12.1 | 2010-11-26 |
578 | MX17004 | tmax | 36.3 | 2010-04-27 |
579 | MX17004 | tmin | 16.7 | 2010-04-27 |
580 | MX17004 | tmax | 33.2 | 2010-05-27 |
581 | MX17004 | tmin | 18.2 | 2010-05-27 |
590 | MX17004 | tmax | 27.7 | 2010-11-27 |
591 | MX17004 | tmin | 14.2 | 2010-11-27 |
610 | MX17004 | tmax | 31.2 | 2010-10-28 |
611 | MX17004 | tmin | 15.0 | 2010-10-28 |
626 | MX17004 | tmax | 30.1 | 2010-06-29 |
627 | MX17004 | tmin | 18.0 | 2010-06-29 |
630 | MX17004 | tmax | 28.0 | 2010-08-29 |
631 | MX17004 | tmin | 15.3 | 2010-08-29 |
638 | MX17004 | tmax | 27.8 | 2010-01-30 |
639 | MX17004 | tmin | 14.5 | 2010-01-30 |
674 | MX17004 | tmax | 25.4 | 2010-08-31 |
675 | MX17004 | tmin | 15.4 | 2010-08-31 |
66 rows × 4 columns
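As an aside, the date assembly can be vectorized instead of the row-wise apply: pd.to_datetime accepts a frame with year/month/day columns. A sketch on weather2:

```python
# vectorized date assembly (a sketch)
w = weather2.assign(day=weather2['day'].str[1:].astype(int))
w['date'] = pd.to_datetime(w[['year', 'month', 'day']])
```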
```python
weather4 = weather3.pivot(index='date', columns='element', values='value')
weather4
```
date | tmax | tmin |
---|---|---|
2010-01-30 | 27.8 | 14.5 |
2010-02-02 | 27.3 | 14.4 |
2010-02-03 | 24.1 | 14.4 |
2010-02-11 | 29.7 | 13.4 |
2010-02-23 | 29.9 | 10.7 |
2010-03-05 | 32.1 | 14.2 |
2010-03-10 | 34.5 | 16.8 |
2010-03-16 | 31.1 | 17.6 |
2010-04-27 | 36.3 | 16.7 |
2010-05-27 | 33.2 | 18.2 |
2010-06-17 | 28.0 | 17.5 |
2010-06-29 | 30.1 | 18.0 |
2010-07-03 | 28.6 | 17.5 |
2010-07-14 | 29.9 | 16.5 |
2010-08-05 | 29.6 | 15.8 |
2010-08-08 | 29.0 | 17.3 |
2010-08-13 | 29.8 | 16.5 |
2010-08-23 | 26.4 | 15.0 |
2010-08-25 | 29.7 | 15.6 |
2010-08-29 | 28.0 | 15.3 |
2010-08-31 | 25.4 | 15.4 |
2010-10-05 | 27.0 | 14.0 |
2010-10-07 | 28.1 | 12.9 |
2010-10-14 | 29.5 | 13.0 |
2010-10-15 | 28.7 | 10.5 |
2010-10-28 | 31.2 | 15.0 |
2010-11-02 | 31.3 | 16.3 |
2010-11-04 | 27.2 | 12.0 |
2010-11-05 | 26.3 | 7.9 |
2010-11-26 | 28.1 | 12.1 |
2010-11-27 | 27.7 | 14.2 |
2010-12-01 | 29.9 | 13.8 |
2010-12-06 | 27.8 | 10.5 |
```python
# pivot per station, so the station id is preserved when there are several
weather3.groupby('id').apply(pd.DataFrame.pivot,
                             index='date', columns='element',
                             values='value').reset_index()
```
  | id | date | tmax | tmin
---|---|---|---|---|
0 | MX17004 | 2010-01-30 | 27.8 | 14.5 |
1 | MX17004 | 2010-02-02 | 27.3 | 14.4 |
2 | MX17004 | 2010-02-03 | 24.1 | 14.4 |
3 | MX17004 | 2010-02-11 | 29.7 | 13.4 |
4 | MX17004 | 2010-02-23 | 29.9 | 10.7 |
5 | MX17004 | 2010-03-05 | 32.1 | 14.2 |
6 | MX17004 | 2010-03-10 | 34.5 | 16.8 |
7 | MX17004 | 2010-03-16 | 31.1 | 17.6 |
8 | MX17004 | 2010-04-27 | 36.3 | 16.7 |
9 | MX17004 | 2010-05-27 | 33.2 | 18.2 |
10 | MX17004 | 2010-06-17 | 28.0 | 17.5 |
11 | MX17004 | 2010-06-29 | 30.1 | 18.0 |
12 | MX17004 | 2010-07-03 | 28.6 | 17.5 |
13 | MX17004 | 2010-07-14 | 29.9 | 16.5 |
14 | MX17004 | 2010-08-05 | 29.6 | 15.8 |
15 | MX17004 | 2010-08-08 | 29.0 | 17.3 |
16 | MX17004 | 2010-08-13 | 29.8 | 16.5 |
17 | MX17004 | 2010-08-23 | 26.4 | 15.0 |
18 | MX17004 | 2010-08-25 | 29.7 | 15.6 |
19 | MX17004 | 2010-08-29 | 28.0 | 15.3 |
20 | MX17004 | 2010-08-31 | 25.4 | 15.4 |
21 | MX17004 | 2010-10-05 | 27.0 | 14.0 |
22 | MX17004 | 2010-10-07 | 28.1 | 12.9 |
23 | MX17004 | 2010-10-14 | 29.5 | 13.0 |
24 | MX17004 | 2010-10-15 | 28.7 | 10.5 |
25 | MX17004 | 2010-10-28 | 31.2 | 15.0 |
26 | MX17004 | 2010-11-02 | 31.3 | 16.3 |
27 | MX17004 | 2010-11-04 | 27.2 | 12.0 |
28 | MX17004 | 2010-11-05 | 26.3 | 7.9 |
29 | MX17004 | 2010-11-26 | 28.1 | 12.1 |
30 | MX17004 | 2010-11-27 | 27.7 | 14.2 |
31 | MX17004 | 2010-12-01 | 29.9 | 13.8 |
32 | MX17004 | 2010-12-06 | 27.8 | 10.5 |
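pivot_table achieves the same reshape and accepts several index columns directly; unlike pivot it aggregates duplicate entries (mean by default). A sketch:

```python
# same reshape via pivot_table (a sketch; duplicates would be averaged)
weather3.pivot_table(index=['id', 'date'], columns='element',
                     values='value').reset_index()
```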
```python
columns = ['year', 'artist', 'track', 'time', 'date entered', 'wk1', 'wk2', 'wk3']
data = [
    [2000, "2,Pac", "Baby Don't Cry", "4:22", "2000-02-26", 87, 82, 72],
    [2000, "2Ge+her", "The Hardest Part Of ...", "3:15", "2000-09-02", 91, 87, 92],
    [2000, "3 Doors Down", "Kryptonite", "3:53", "2000-04-08", 81, 70, 68],
    [2000, "98^0", "Give Me Just One Nig...", "3:24", "2000-08-19", 51, 39, 34],
    [2000, "A*Teens", "Dancing Queen", "3:44", "2000-07-08", 97, 97, 96],
    [2000, "Aaliyah", "I Don't Wanna", "4:15", "2000-01-29", 84, 62, 51],
    [2000, "Aaliyah", "Try Again", "4:03", "2000-03-18", 59, 53, 38],
    [2000, "Adams,Yolanda", "Open My Heart", "5:30", "2000-08-26", 76, 76, 74],
]
messy = pd.DataFrame(data=data, columns=columns)
messy
```
  | year | artist | track | time | date entered | wk1 | wk2 | wk3 |
---|---|---|---|---|---|---|---|---|
0 | 2000 | 2,Pac | Baby Don't Cry | 4:22 | 2000-02-26 | 87 | 82 | 72 |
1 | 2000 | 2Ge+her | The Hardest Part Of ... | 3:15 | 2000-09-02 | 91 | 87 | 92 |
2 | 2000 | 3 Doors Down | Kryptonite | 3:53 | 2000-04-08 | 81 | 70 | 68 |
3 | 2000 | 98^0 | Give Me Just One Nig... | 3:24 | 2000-08-19 | 51 | 39 | 34 |
4 | 2000 | A*Teens | Dancing Queen | 3:44 | 2000-07-08 | 97 | 97 | 96 |
5 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2000-01-29 | 84 | 62 | 51 |
6 | 2000 | Aaliyah | Try Again | 4:03 | 2000-03-18 | 59 | 53 | 38 |
7 | 2000 | Adams,Yolanda | Open My Heart | 5:30 | 2000-08-26 | 76 | 76 | 74 |
```python
molten = pd.melt(messy,
                 id_vars=['year', 'artist', 'track', 'time', 'date entered'],
                 var_name='week',
                 value_name='rank')
molten.sort_values(by=['date entered', 'week'], inplace=True)
molten.head()
```
  | year | artist | track | time | date entered | week | rank |
---|---|---|---|---|---|---|---|
5 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2000-01-29 | wk1 | 84 |
13 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2000-01-29 | wk2 | 62 |
21 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2000-01-29 | wk3 | 51 |
0 | 2000 | 2,Pac | Baby Don't Cry | 4:22 | 2000-02-26 | wk1 | 87 |
8 | 2000 | 2,Pac | Baby Don't Cry | 4:22 | 2000-02-26 | wk2 | 82 |
```python
# turn 'wk1', 'wk2', ... into integer week numbers
molten['week'] = molten['week'].apply(lambda s: int(s[2:]))
molten.head()
```
  | year | artist | track | time | date entered | week | rank |
---|---|---|---|---|---|---|---|
5 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2000-01-29 | 1 | 84 |
13 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2000-01-29 | 2 | 62 |
21 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2000-01-29 | 3 | 51 |
0 | 2000 | 2,Pac | Baby Don't Cry | 4:22 | 2000-02-26 | 1 | 87 |
8 | 2000 | 2,Pac | Baby Don't Cry | 4:22 | 2000-02-26 | 2 | 82 |
```python
from datetime import datetime, timedelta

def increment_date(row):
    # the chart date for week N is the entry date plus N-1 weeks
    date = datetime.strptime(row['date entered'], "%Y-%m-%d")
    return date + timedelta(7) * (row['week'] - 1)

molten['date'] = molten.apply(increment_date, axis=1)
molten.drop('date entered', axis=1, inplace=True)
molten.head()
```
  | year | artist | track | time | week | rank | date |
---|---|---|---|---|---|---|---|
5 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 1 | 84 | 2000-01-29 |
13 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 2 | 62 | 2000-02-05 |
21 | 2000 | Aaliyah | I Don't Wanna | 4:15 | 3 | 51 | 2000-02-12 |
0 | 2000 | 2,Pac | Baby Don't Cry | 4:22 | 1 | 87 | 2000-02-26 |
8 | 2000 | 2,Pac | Baby Don't Cry | 4:22 | 2 | 82 | 2000-03-04 |
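The row-wise increment_date can likewise be vectorized; a sketch, assuming it runs while molten still has the 'date entered' column (i.e. before the drop above):

```python
# vectorized equivalent of increment_date (a sketch; needs 'date entered')
dates = (pd.to_datetime(molten['date entered'])
         + pd.to_timedelta(7 * (molten['week'] - 1), unit='D'))
```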
```python
tidy_track = molten[['year', 'artist', 'track', 'time']]\
    .groupby(['year', 'artist', 'track'])\
    .first()
tidy_track.reset_index(inplace=True)
tidy_track.reset_index(inplace=True)   # second reset turns the row number into a column
tidy_track.rename(columns={'index': 'id'}, inplace=True)
tidy_track
```
  | id | year | artist | track | time |
---|---|---|---|---|---|
0 | 0 | 2000 | 2,Pac | Baby Don't Cry | 4:22 |
1 | 1 | 2000 | 2Ge+her | The Hardest Part Of ... | 3:15 |
2 | 2 | 2000 | 3 Doors Down | Kryptonite | 3:53 |
3 | 3 | 2000 | 98^0 | Give Me Just One Nig... | 3:24 |
4 | 4 | 2000 | A*Teens | Dancing Queen | 3:44 |
5 | 5 | 2000 | Aaliyah | I Don't Wanna | 4:15 |
6 | 6 | 2000 | Aaliyah | Try Again | 4:03 |
7 | 7 | 2000 | Adams,Yolanda | Open My Heart | 5:30 |
```python
tidy_rank = pd.merge(molten, tidy_track, on='track')
tidy_rank = tidy_rank[['id', 'date', 'rank']]
tidy_rank.head()
```
  | id | date | rank |
---|---|---|---|
0 | 5 | 2000-01-29 | 84 |
1 | 5 | 2000-02-05 | 62 |
2 | 5 | 2000-02-12 | 51 |
3 | 0 | 2000-02-26 | 87 |
4 | 0 | 2000-03-04 | 82 |
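With the observations split into two tidy tables, per-track questions become a groupby plus a join; a sketch computing each track's best (lowest) chart position:

```python
# best rank per track, joined back onto the track table (a sketch)
best = tidy_rank.groupby('id')['rank'].min().rename('best_rank')
tidy_track.join(best, on='id')
```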